1 # Add DLM to the build system
2 diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
3 --- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
4 +++ linux/cluster/Kconfig       2004-06-17 15:00:57.000000000 +0800
5 @@ -10,4 +10,22 @@ config CLUSTER
6         needed by all the other components. It provides membership services
7         for those other subsystems.
8  
9 +config CLUSTER_DLM
10 +       tristate "Distributed Lock Manager"
11 +       depends on CLUSTER
12 +       ---help---
13 +       A fully distributed lock manager, providing cluster-wide locking services
14 +       and protected lock namespaces for kernel and userland applications.
15 +
16 +config CLUSTER_DLM_PROCLOCKS
17 +       boolean "/proc/locks support for DLM"
18 +       depends on CLUSTER_DLM
19 +       depends on PROC_FS
20 +       ---help---
21 +       If this option is enabled, a file will appear at /proc/cluster/dlm_locks.
22 +       Write the name of a lockspace known to the DLM into this "file", then
23 +       read back a list of all the resources and locks in that lockspace that
24 +       are known to the local node. Note that because the DLM is distributed,
25 +       this may not be the full lock picture.
26 +
27  endmenu
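# A minimal userland sketch of the CLUSTER_DLM_PROCLOCKS interface described
# in the help text above: write a lockspace name into /proc/cluster/dlm_locks,
# then read the lock dump back.  The lockspace name "default" is only a
# placeholder here; use any lockspace known to the local DLM.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/proc/cluster/dlm_locks", O_RDWR);

	if (fd < 0)
		return 1;
	/* select the lockspace to dump */
	write(fd, "default", 7);
	/* read back the resources/locks known to the local node */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}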
28 diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
29 --- linux-2.6.7/cluster/Makefile        2004-06-17 15:00:36.000000000 +0800
30 +++ linux/cluster/Makefile      2004-06-17 15:00:57.000000000 +0800
31 @@ -1,3 +1,4 @@
32  obj-y  := nocluster.o
33  
34  obj-$(CONFIG_CLUSTER)         += cman/
35 +obj-$(CONFIG_CLUSTER_DLM)     += dlm/
36 diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37 --- linux-2.6.7/cluster/dlm/Makefile    1970-01-01 07:30:00.000000000 +0730
38 +++ linux/cluster/dlm/Makefile  2004-06-17 15:00:57.000000000 +0800
39 @@ -0,0 +1,23 @@
40 +dlm-objs                 :=    ast.o \
41 +                               config.o \
42 +                               device.o \
43 +                               dir.o \
44 +                               lkb.o \
45 +                               locking.o \
46 +                               lockqueue.o \
47 +                               lockspace.o \
48 +                               lowcomms.o \
49 +                               main.o \
50 +                               memory.o \
51 +                               midcomms.o \
52 +                               nodes.o \
53 +                               proc.o \
54 +                               queries.o \
55 +                               rebuild.o \
56 +                               reccomms.o \
57 +                               recover.o \
58 +                               recoverd.o \
59 +                               rsb.o \
60 +                               util.o
61 +
62 +obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63 diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64 --- linux-orig/cluster/dlm/ast.c        1970-01-01 07:30:00.000000000 +0730
65 +++ linux-patched/cluster/dlm/ast.c     2004-06-25 18:31:07.000000000 +0800
66 @@ -0,0 +1,581 @@
67 +/******************************************************************************
68 +*******************************************************************************
69 +**
70 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
71 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
72 +**  
73 +**  This copyrighted material is made available to anyone wishing to use,
74 +**  modify, copy, or redistribute it subject to the terms and conditions
75 +**  of the GNU General Public License v.2.
76 +**
77 +*******************************************************************************
78 +******************************************************************************/
79 +
80 +/* 
81 + * This delivers ASTs and checks for dead remote requests and deadlocks.
82 + */
83 +
84 +#include <linux/timer.h>
85 +
86 +#include "dlm_internal.h"
87 +#include "rsb.h"
88 +#include "lockqueue.h"
89 +#include "dir.h"
90 +#include "locking.h"
91 +#include "lkb.h"
92 +#include "lowcomms.h"
93 +#include "midcomms.h"
94 +#include "ast.h"
95 +#include "nodes.h"
96 +#include "config.h"
97 +
98 +/* Wake up flags for astd */
99 +#define GDLMD_WAKE_ASTS  1
100 +#define GDLMD_WAKE_TIMER 2
101 +
102 +static struct list_head _deadlockqueue;
103 +static struct semaphore _deadlockqueue_lock;
104 +static struct list_head _lockqueue;
105 +static struct semaphore _lockqueue_lock;
106 +static struct timer_list _lockqueue_timer;
107 +static struct list_head _ast_queue;
108 +static struct semaphore _ast_queue_lock;
109 +static wait_queue_head_t _astd_waitchan;
110 +static atomic_t _astd_running;
111 +static long _astd_pid;
112 +static unsigned long _astd_wakeflags;
113 +static struct completion _astd_done;
114 +
115 +void add_to_lockqueue(gd_lkb_t *lkb)
116 +{
117 +       /* Time stamp the entry so we know if it's been waiting too long */
118 +       lkb->lkb_lockqueue_time = jiffies;
119 +
120 +       down(&_lockqueue_lock);
121 +       list_add(&lkb->lkb_lockqueue, &_lockqueue);
122 +       up(&_lockqueue_lock);
123 +}
124 +
125 +void remove_from_lockqueue(gd_lkb_t *lkb)
126 +{
127 +       down(&_lockqueue_lock);
128 +       list_del(&lkb->lkb_lockqueue);
129 +       up(&_lockqueue_lock);
130 +}
131 +
132 +void add_to_deadlockqueue(gd_lkb_t *lkb)
133 +{
134 +       if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
135 +               return;
136 +       lkb->lkb_duetime = jiffies;
137 +       down(&_deadlockqueue_lock);
138 +       list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
139 +       up(&_deadlockqueue_lock);
140 +}
141 +
142 +void remove_from_deadlockqueue(gd_lkb_t *lkb)
143 +{
144 +       if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
145 +               return;
146 +
147 +       down(&_deadlockqueue_lock);
148 +       list_del(&lkb->lkb_deadlockq);
149 +       up(&_deadlockqueue_lock);
150 +
151 +       /* Invalidate the due time */
152 +       memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
153 +}
154 +
155 +void remove_from_astqueue(gd_lkb_t *lkb)
156 +{
157 +       down(&_ast_queue_lock);
158 +       if (lkb->lkb_asts_to_deliver)
159 +               list_del(&lkb->lkb_astqueue);
160 +       lkb->lkb_asts_to_deliver = 0;
161 +       up(&_ast_queue_lock);
162 +}
163 +
164 +/* 
165 + * Actually deliver an AST to a user. The caller MUST hold the ast queue lock
166 + * and we unlock it for the duration of the user call, otherwise things can
167 + * deadlock.
168 + */
169 +
170 +static void deliver_ast(gd_lkb_t *lkb, gd_ast_type_t astt)
171 +{
172 +       void (*cast) (long param) = lkb->lkb_astaddr;
173 +       void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
174 +
175 +       up(&_ast_queue_lock);
176 +
177 +       if (cast && (astt == GDLM_QUEUE_COMPAST))
178 +               cast(lkb->lkb_astparam);
179 +
180 +       else if (bast && (astt == GDLM_QUEUE_BLKAST)
181 +                && (lkb->lkb_status == GDLM_LKSTS_GRANTED))
182 +               bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
183 +
184 +       /* 
185 +        * Remove LKB if requested.  It is up to the caller to remove the LKB
186 +        * from any resource queue it may be on.
187 +        *
188 +        * NOTE: we check lkb_asts_to_deliver here in case an ast for us was
189 +        * queued during the AST delivery itself (eg a user called dlm_unlock
190 +        * in the AST routine!
191 +        */
192 +
193 +       if (lkb->lkb_flags & GDLM_LKFLG_DELAST && astt == GDLM_QUEUE_COMPAST &&
194 +           lkb->lkb_asts_to_deliver == 0) {
195 +               gd_res_t *rsb = lkb->lkb_resource;
196 +               struct rw_semaphore *in_recovery = &rsb->res_ls->ls_in_recovery;
197 +
198 +               down_read(in_recovery);
199 +               release_lkb(rsb->res_ls, lkb);
200 +               release_rsb(rsb);
201 +               up_read(in_recovery);
202 +       }
203 +
204 +       /* This queue can get very big so we schedule here to give the rest of
205 +        * the cluster a chance to do some work. */
206 +       schedule();
207 +
208 +       down(&_ast_queue_lock);
209 +}
210 +
211 +/*
212 + * Queue an AST for delivery.  This only deals with kernel ASTs;
213 + * the usermode API piggybacks on top of this.
214 + *
215 + * This can be called in either the user or DLM context.
216 + * ASTs are queued EVEN IF we are already running in gdlm_astd
217 + * context, as we don't know what other locks are held (eg we
218 + * could be called from a lock operation that was itself called
219 + * from another AST).
220 + * If the AST is to be queued remotely then a message is sent to
221 + * the target system via midcomms.
222 + */
223 +
224 +void queue_ast(gd_lkb_t *lkb, gd_ast_type_t astt, uint8_t rqmode)
225 +{
226 +       struct gd_remlockrequest req;
227 +
228 +       if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
229 +               /* 
230 +                * Send a message to have an ast queued remotely.  Note: we do
231 +                * not send remote completion asts; they are handled as part of
232 +                * remote lock granting.
233 +                */
234 +
235 +               if (astt == GDLM_QUEUE_BLKAST) {
236 +                       req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
237 +                       req.rr_header.rh_length = sizeof(req);
238 +                       req.rr_header.rh_flags = 0;
239 +                       req.rr_header.rh_lkid = lkb->lkb_id;
240 +                       req.rr_header.rh_lockspace =
241 +                           lkb->lkb_resource->res_ls->ls_global_id;
242 +                       req.rr_status = lkb->lkb_retstatus;
243 +                       req.rr_remlkid = lkb->lkb_remid;
244 +                       req.rr_rqmode = rqmode;
245 +
246 +                       midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
247 +                                             lkb->lkb_resource->res_ls->ls_allocation);
248 +
249 +               } else if (lkb->lkb_retstatus == -EDEADLOCK) {
250 +                       /* 
251 +                        * We only queue remote Completion ASTs here for error
252 +                        * completions that happen out of band.
253 +                        * DEADLOCK is one such.
254 +                        */
255 +
256 +                       req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
257 +                       req.rr_header.rh_length = sizeof(req);
258 +                       req.rr_header.rh_flags = 0;
259 +                       req.rr_header.rh_lkid = lkb->lkb_id;
260 +                       req.rr_header.rh_lockspace =
261 +                           lkb->lkb_resource->res_ls->ls_global_id;
262 +                       req.rr_status = lkb->lkb_retstatus;
263 +                       req.rr_remlkid = lkb->lkb_remid;
264 +                       req.rr_rqmode = rqmode;
265 +
266 +                       midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
267 +                                             lkb->lkb_resource->res_ls->ls_allocation);
268 +               }
269 +       } else {
270 +               /* 
271 +                * Prepare info which will be returned in ast/bast.
272 +                */
273 +
274 +               if (astt == GDLM_QUEUE_BLKAST) {
275 +                       lkb->lkb_bastmode = rqmode;
276 +               } else {
277 +                       lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
278 +
279 +                       if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
280 +                               lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
281 +                       else
282 +                               lkb->lkb_lksb->sb_flags = 0;
283 +               }
284 +
285 +               /* 
286 +                * Queue ast/bast or deliver directly.  astd can deliver ASTs
287 +                * during deadlock detection or lock timeouts.
288 +                */
289 +
290 +               down(&_ast_queue_lock);
291 +
292 +               if (!lkb->lkb_asts_to_deliver)
293 +                       list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
294 +               lkb->lkb_asts_to_deliver |= astt;
295 +
296 +               up(&_ast_queue_lock);
297 +
298 +               /* It is the responsibility of the caller to call wake_astd()
299 +                * after it has finished any other locking operations that
300 +                * requested ASTs to be delivered */
301 +       }
302 +}
303 +
304 +/* 
305 + * Process any LKBs on the AST queue.  They were queued in queue_ast().
306 + */
307 +
308 +static void process_asts(void)
309 +{
310 +       gd_lkb_t *lkb, *safe;
311 +       uint32_t to_deliver;
312 +
313 +       down(&_ast_queue_lock);
314 +
315 +       list_for_each_entry_safe(lkb, safe, &_ast_queue, lkb_astqueue) {
316 +
317 +               /* The lkb can be placed back on _ast_queue as soon as
318 +                * _ast_queue_lock is released. */
319 +
320 +               to_deliver = lkb->lkb_asts_to_deliver;
321 +               lkb->lkb_asts_to_deliver = 0;
322 +               list_del(&lkb->lkb_astqueue);
323 +
324 +               if ((to_deliver & GDLM_QUEUE_COMPAST))
325 +                       deliver_ast(lkb, GDLM_QUEUE_COMPAST);
326 +
327 +               if ((to_deliver & GDLM_QUEUE_BLKAST))
328 +                       deliver_ast(lkb, GDLM_QUEUE_BLKAST);
329 +       }
330 +       up(&_ast_queue_lock);
331 +}
332 +
333 +void lockqueue_lkb_mark(gd_ls_t *ls)
334 +{
335 +       gd_lkb_t *lkb, *safe;
336 +       int count = 0;
337 +
338 +       log_all(ls, "mark waiting requests");
339 +
340 +       down(&_lockqueue_lock);
341 +
342 +       list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
343 +
344 +               if (lkb->lkb_resource->res_ls != ls)
345 +                       continue;
346 +
347 +               /* 
348 +                * These lkb's are new and the master is being looked up.  Mark
349 +                * the lkb request to be resent.  Even if the destination node
350 +                * for the request is still living and has our request, it will
351 +                * purge all resdir requests in purge_requestqueue.  If there's
352 +                * a reply to the LOOKUP request in our requestqueue (the reply
353 +                * arrived after ls_stop), it is invalid and will be discarded
354 +                * in purge_requestqueue, too.
355 +                */
356 +
357 +               if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
358 +                       GDLM_ASSERT(lkb->lkb_nodeid == -1,
359 +                                   log_error(ls, "nodeid=%d\n",
360 +                                             lkb->lkb_nodeid););
361 +
362 +                       lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
363 +                       count++;
364 +                       continue;
365 +               }
366 +
367 +               /* 
368 +                * These lkb's have an outstanding request to a bygone node.
369 +                * The request will be redirected to the new master node in
370 +                * resend_cluster_requests().  Don't mark the request for
371 +                * resending if there's a reply for it saved in the
372 +                * requestqueue.
373 +                */
374 +
375 +               if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
376 +                   !reply_in_requestqueue(ls, lkb->lkb_id)) {
377 +
378 +                       lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
379 +
380 +                       /* 
381 +                        * Don't rebuild this lkb on a new rsb in
382 +                        * rebuild_rsbs_send().
383 +                        */
384 +
385 +                       if (lkb->lkb_lockqueue_state ==
386 +                           GDLM_LQSTATE_WAIT_CONDGRANT) {
387 +                               GDLM_ASSERT(lkb->lkb_status ==
388 +                                           GDLM_LKSTS_WAITING, );
389 +                               lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
390 +                       }
391 +
392 +                       /* 
393 +                        * This flag indicates to the new master that his lkb
394 +                        * is in the midst of a convert request and should be
395 +                        * placed on the granted queue rather than the convert
396 +                        * queue.  We will resend this convert request to the
397 +                        * new master.
398 +                        */
399 +
400 +                       else if (lkb->lkb_lockqueue_state ==
401 +                                GDLM_LQSTATE_WAIT_CONVERT) {
402 +                               GDLM_ASSERT(lkb->lkb_status ==
403 +                                           GDLM_LKSTS_CONVERT, );
404 +                               lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
405 +                       }
406 +
407 +                       count++;
408 +               }
409 +       }
410 +       up(&_lockqueue_lock);
411 +
412 +       log_all(ls, "marked %d requests", count);
413 +}
414 +
415 +int resend_cluster_requests(gd_ls_t *ls)
416 +{
417 +       gd_lkb_t *lkb, *safe;
418 +       int error = 0, state, count = 0;
419 +
420 +       log_all(ls, "resend marked requests");
421 +
422 +       down(&_lockqueue_lock);
423 +
424 +       list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
425 +
426 +               if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
427 +                       log_debug(ls, "resend_cluster_requests: aborted");
428 +                       error = -EINTR;
429 +                       break;
430 +               }
431 +
432 +               if (lkb->lkb_resource->res_ls != ls)
433 +                       continue;
434 +
435 +               log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
436 +                         "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
437 +                         lkb->lkb_lockqueue_state, lkb->lkb_flags);
438 +
439 +               /* 
440 +                * Resend/process the lockqueue lkb's (in-progress requests)
441 +                * that were flagged at the start of recovery in
442 +                * lockqueue_lkb_mark().
443 +                */
444 +
445 +               if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
446 +                       lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
447 +                       lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
448 +                       lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
449 +
450 +                       if (lkb->lkb_nodeid == -1) {
451 +                               /* 
452 +                                * Send lookup to new resdir node.
453 +                                */
454 +                               lkb->lkb_lockqueue_time = jiffies;
455 +                               send_cluster_request(lkb,
456 +                                                    lkb->lkb_lockqueue_state);
457 +                       }
458 +
459 +                       else if (lkb->lkb_nodeid != 0) {
460 +                               /* 
461 +                                * There's a new RSB master (that's not us).
462 +                                */
463 +                               lkb->lkb_lockqueue_time = jiffies;
464 +                               send_cluster_request(lkb,
465 +                                                    lkb->lkb_lockqueue_state);
466 +                       }
467 +
468 +                       else {
469 +                               /* 
470 +                                * We are the new RSB master for this lkb
471 +                                * request.
472 +                                */
473 +                               state = lkb->lkb_lockqueue_state;
474 +                               lkb->lkb_lockqueue_state = 0;
475 +                               /* list_del equals remove_from_lockqueue() */
476 +                               list_del(&lkb->lkb_lockqueue);
477 +                               process_remastered_lkb(lkb, state);
478 +                       }
479 +
480 +                       count++;
481 +               }
482 +       }
483 +       up(&_lockqueue_lock);
484 +
485 +       log_all(ls, "resent %d requests", count);
486 +       return error;
487 +}
488 +
489 +/* 
490 + * Process any LKBs on the lockqueue.  This just looks at
491 + * the entries to see if they have been on the queue too
492 + * long, and fails the requests if so.
493 + */
494 +
495 +static void process_lockqueue(void)
496 +{
497 +       gd_lkb_t *lkb, *safe;
498 +       gd_ls_t *ls;
499 +       int count = 0;
500 +
501 +       down(&_lockqueue_lock);
502 +
503 +       list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
504 +               ls = lkb->lkb_resource->res_ls;
505 +
506 +               if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
507 +                       continue;
508 +
509 +               /* Don't time out locks that are in transition */
510 +               if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
511 +                       continue;
512 +
513 +               if (check_timeout(lkb->lkb_lockqueue_time,
514 +                                 dlm_config.lock_timeout)) {
515 +                       count++;
516 +                       list_del(&lkb->lkb_lockqueue);
517 +                       up(&_lockqueue_lock);
518 +                       cancel_lockop(lkb, -ETIMEDOUT);
519 +                       down(&_lockqueue_lock);
520 +               }
521 +       }
522 +       up(&_lockqueue_lock);
523 +
524 +       if (count)
525 +               wake_astd();
526 +
527 +       if (atomic_read(&_astd_running))
528 +               mod_timer(&_lockqueue_timer,
529 +                         jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
530 +}
531 +
532 +/* Look for deadlocks */
533 +static void process_deadlockqueue(void)
534 +{
535 +       gd_lkb_t *lkb, *safe;
536 +
537 +       down(&_deadlockqueue_lock);
538 +
539 +       list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
540 +               gd_lkb_t *kill_lkb;
541 +
542 +               /* Only look at "due" locks */
543 +               if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
544 +                       break;
545 +
546 +               /* Don't look at locks that are in transition */
547 +               if (!test_bit(LSFL_LS_RUN,
548 +                             &lkb->lkb_resource->res_ls->ls_flags))
549 +                       continue;
550 +
551 +               up(&_deadlockqueue_lock);
552 +
553 +               /* Lock has hit due time, check for conversion deadlock */
554 +               kill_lkb = conversion_deadlock_check(lkb);
555 +               if (kill_lkb)
556 +                       cancel_conversion(kill_lkb, -EDEADLOCK);
557 +
558 +               down(&_deadlockqueue_lock);
559 +       }
560 +       up(&_deadlockqueue_lock);
561 +}
562 +
563 +static __inline__ int no_asts(void)
564 +{
565 +       int ret;
566 +
567 +       down(&_ast_queue_lock);
568 +       ret = list_empty(&_ast_queue);
569 +       up(&_ast_queue_lock);
570 +       return ret;
571 +}
572 +
573 +static void lockqueue_timer_fn(unsigned long arg)
574 +{
575 +       set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
576 +       wake_up(&_astd_waitchan);
577 +}
578 +
579 +/* 
580 + * DLM daemon which delivers asts.
581 + */
582 +
583 +static int dlm_astd(void *data)
584 +{
585 +       daemonize("dlm_astd");
586 +
587 +       INIT_LIST_HEAD(&_lockqueue);
588 +       init_MUTEX(&_lockqueue_lock);
589 +       INIT_LIST_HEAD(&_deadlockqueue);
590 +       init_MUTEX(&_deadlockqueue_lock);
591 +       INIT_LIST_HEAD(&_ast_queue);
592 +       init_MUTEX(&_ast_queue_lock);
593 +       init_waitqueue_head(&_astd_waitchan);
594 +       complete(&_astd_done);
595 +
596 +       /* 
597 +        * Set a timer to check the lockqueue for dead locks (and deadlocks).
598 +        */
599 +
600 +       init_timer(&_lockqueue_timer);
601 +       _lockqueue_timer.function = lockqueue_timer_fn;
602 +       _lockqueue_timer.data = 0;
603 +       mod_timer(&_lockqueue_timer,
604 +                 jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
605 +
606 +       while (atomic_read(&_astd_running)) {
607 +               wchan_cond_sleep_intr(_astd_waitchan, no_asts());
608 +
609 +               if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
610 +                       process_asts();
611 +
612 +               if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
613 +                       process_lockqueue();
614 +                       if (dlm_config.deadlocktime)
615 +                               process_deadlockqueue();
616 +               }
617 +       }
618 +
619 +       if (timer_pending(&_lockqueue_timer))
620 +               del_timer(&_lockqueue_timer);
621 +
622 +       complete(&_astd_done);
623 +
624 +       return 0;
625 +}
626 +
627 +void wake_astd(void)
628 +{
629 +       set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
630 +       wake_up(&_astd_waitchan);
631 +}
632 +
633 +int astd_start(void)
634 +{
635 +       init_completion(&_astd_done);
636 +       atomic_set(&_astd_running, 1);
637 +       _astd_pid = kernel_thread(dlm_astd, NULL, 0);
638 +       wait_for_completion(&_astd_done);
639 +       return 0;
640 +}
641 +
642 +void astd_stop(void)
643 +{
644 +       atomic_set(&_astd_running, 0);
645 +       wake_astd();
646 +       wait_for_completion(&_astd_done);
647 +}
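# A hedged illustration of the queue_ast()/wake_astd() contract documented
# in ast.c above: queue_ast() only marks ASTs for delivery, so a caller
# finishes all of its locking work and then kicks the astd daemon once.
# "lkb", "blocked_lkb" and "rqmode" are placeholder variables.

	/* deliver a completion AST and a blocking AST for two lkbs */
	queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
	queue_ast(blocked_lkb, GDLM_QUEUE_BLKAST, rqmode);
	/* ... any further lock processing ... */
	wake_astd();	/* one wakeup delivers everything queued above */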
648 diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
649 --- linux-orig/cluster/dlm/ast.h        1970-01-01 07:30:00.000000000 +0730
650 +++ linux-patched/cluster/dlm/ast.h     2004-06-25 18:31:07.000000000 +0800
651 @@ -0,0 +1,29 @@
652 +/******************************************************************************
653 +*******************************************************************************
654 +**
655 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
656 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
657 +**  
658 +**  This copyrighted material is made available to anyone wishing to use,
659 +**  modify, copy, or redistribute it subject to the terms and conditions
660 +**  of the GNU General Public License v.2.
661 +**
662 +*******************************************************************************
663 +******************************************************************************/
664 +
665 +#ifndef __AST_DOT_H__
666 +#define __AST_DOT_H__
667 +
668 +void lockqueue_lkb_mark(gd_ls_t * ls);
669 +int resend_cluster_requests(gd_ls_t * ls);
670 +void add_to_lockqueue(gd_lkb_t * lkb);
671 +void remove_from_lockqueue(gd_lkb_t * lkb);
672 +void add_to_deadlockqueue(gd_lkb_t * lkb);
673 +void remove_from_deadlockqueue(gd_lkb_t * lkb);
674 +void remove_from_astqueue(gd_lkb_t * lkb);
675 +void queue_ast(gd_lkb_t * lkb, gd_ast_type_t astt, uint8_t rqmode);
676 +void wake_astd(void);
677 +int astd_start(void);
678 +void astd_stop(void);
679 +
680 +#endif                         /* __AST_DOT_H__ */
681 diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
682 --- linux-orig/cluster/dlm/config.c     1970-01-01 07:30:00.000000000 +0730
683 +++ linux-patched/cluster/dlm/config.c  2004-06-25 18:31:07.000000000 +0800
684 @@ -0,0 +1,125 @@
685 +/******************************************************************************
686 +*******************************************************************************
687 +**
688 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
689 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
690 +**
691 +**  This copyrighted material is made available to anyone wishing to use,
692 +**  modify, copy, or redistribute it subject to the terms and conditions
693 +**  of the GNU General Public License v.2.
694 +**
695 +*******************************************************************************
696 +******************************************************************************/
697 +
698 +#include <linux/module.h>
699 +#include <linux/proc_fs.h>
700 +
701 +#include "dlm_internal.h"
702 +#include "lowcomms.h"
703 +#include "config.h"
704 +
705 +/* Config file defaults */
706 +#define DEFAULT_TCP_PORT       21064
707 +#define DEFAULT_LOCK_TIMEOUT      30
708 +#define DEFAULT_BUFFER_SIZE     4096
709 +#define DEFAULT_RESHASHTBL       256
710 +#define DEFAULT_LOCKIDTBL       1024
711 +#define DEFAULT_MAX_CONNECTIONS  128
712 +#define DEFAULT_DEADLOCKTIME      10
713 +
714 +struct config_info dlm_config = {
715 +       .tcp_port = DEFAULT_TCP_PORT,
716 +       .lock_timeout = DEFAULT_LOCK_TIMEOUT,
717 +       .buffer_size = DEFAULT_BUFFER_SIZE,
718 +       .reshashtbl = DEFAULT_RESHASHTBL,
719 +       .lockidtbl = DEFAULT_LOCKIDTBL,
720 +       .max_connections = DEFAULT_MAX_CONNECTIONS,
721 +       .deadlocktime = DEFAULT_DEADLOCKTIME,
722 +};
723 +
724 +
725 +static struct config_proc_info {
726 +    char *name;
727 +    int  *value;
728 +} config_proc[] = {
729 +    {
730 +       .name = "tcp_port",
731 +       .value = &dlm_config.tcp_port,
732 +    },
733 +    {
734 +       .name = "lock_timeout",
735 +       .value = &dlm_config.lock_timeout,
736 +    },
737 +    {
738 +       .name = "buffer_size",
739 +       .value = &dlm_config.buffer_size,
740 +    },
741 +    {
742 +       .name = "reshashtbl",
743 +       .value = &dlm_config.reshashtbl,
744 +    },
745 +    {
746 +       .name = "lockidtbl",
747 +       .value = &dlm_config.lockidtbl,
748 +    },
749 +    {
750 +       .name = "max_connections",
751 +       .value = &dlm_config.max_connections,
752 +    },
753 +    {
754 +       .name = "deadlocktime",
755 +       .value = &dlm_config.deadlocktime,
756 +    },
757 +};
758 +static struct proc_dir_entry *dlm_dir;
759 +
760 +static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
761 +                               int *eof, void *data)
762 +{
763 +       struct config_proc_info *cinfo = data;
764 +       return snprintf(page, count, "%d\n", *cinfo->value);
765 +}
766 +
767 +static int dlm_config_write_proc(struct file *file, const char *buffer,
768 +                                unsigned long count, void *data)
769 +{
770 +       struct config_proc_info *cinfo = data;
771 +       int value;
772 +       char *end;
773 +
774 +       value = simple_strtoul(buffer, &end, 10);
775 +       if (*end)       /* digits must be followed by eg a newline */
776 +               *cinfo->value = value;
777 +       return count;
778 +}
779 +
780 +int dlm_config_init(void)
781 +{
782 +       int i;
783 +       struct proc_dir_entry *pde;
784 +
785 +       dlm_dir = proc_mkdir("cluster/config/dlm", 0);
786 +       if (!dlm_dir)
787 +               return -1;
788 +
789 +       dlm_dir->owner = THIS_MODULE;
790 +
791 +       for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
792 +               pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
793 +               if (pde) {
794 +                       pde->data = &config_proc[i];
795 +                       pde->write_proc = dlm_config_write_proc;
796 +                       pde->read_proc = dlm_config_read_proc;
797 +               }
798 +       }
799 +       return 0;
800 +}
801 +
802 +void dlm_config_exit(void)
803 +{
804 +       int i;
805 +
806 +       for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
807 +               remove_proc_entry(config_proc[i].name, dlm_dir);
808 +       remove_proc_entry("cluster/config/dlm", NULL);
809 +}
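# A small userland sketch of the /proc tunables that config.c above creates
# under /proc/cluster/config/dlm/.  Note that dlm_config_write_proc() only
# applies a value when the digits are followed by another character, so write
# the trailing newline just as echo(1) would.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[16];
	ssize_t n;
	int fd = open("/proc/cluster/config/dlm/lock_timeout", O_RDWR);

	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("lock_timeout was %s", buf);	/* eg "30\n" */
	}
	write(fd, "60\n", 3);	/* raise the lock timeout to 60s */
	close(fd);
	return 0;
}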
810 diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
811 --- linux-orig/cluster/dlm/config.h     1970-01-01 07:30:00.000000000 +0730
812 +++ linux-patched/cluster/dlm/config.h  2004-06-25 18:31:07.000000000 +0800
813 @@ -0,0 +1,31 @@
814 +/******************************************************************************
815 +*******************************************************************************
816 +**
817 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
818 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
819 +**  
820 +**  This copyrighted material is made available to anyone wishing to use,
821 +**  modify, copy, or redistribute it subject to the terms and conditions
822 +**  of the GNU General Public License v.2.
823 +**
824 +*******************************************************************************
825 +******************************************************************************/
826 +
827 +#ifndef __CONFIG_DOT_H__
828 +#define __CONFIG_DOT_H__
829 +
830 +struct config_info {
831 +       int tcp_port;
832 +       int lock_timeout;
833 +       int buffer_size;
834 +       int reshashtbl;
835 +       int lockidtbl;
836 +       int max_connections;
837 +       int deadlocktime;
838 +};
839 +
840 +extern struct config_info dlm_config;
841 +extern int  dlm_config_init(void);
842 +extern void dlm_config_exit(void);
843 +
844 +#endif                         /* __CONFIG_DOT_H__ */
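# Before the device.c diff below, a hedged sketch of the read protocol it
# implements (normally hidden behind libdlm): each read() returns exactly one
# struct dlm_lock_result, and FIONREAD reports how many ASTs are queued, not
# a byte count.  The "dlm_device.h" include naming that struct is an assumed
# path, not something this patch provides under that name.

#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include "dlm_device.h"		/* assumed: defines struct dlm_lock_result */

static void drain_asts(int fd)
{
	struct dlm_lock_result res;
	int pending = 0;

	/* FIONREAD returns the number of queued ASTs, NOT bytes */
	if (ioctl(fd, FIONREAD, &pending) < 0)
		return;
	while (pending-- > 0) {
		if (read(fd, &res, sizeof(res)) != sizeof(res))
			break;
		/* dispatch res.astaddr(res.astparam), as libdlm does */
	}
}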
845 diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
846 --- linux-orig/cluster/dlm/device.c     1970-01-01 07:30:00.000000000 +0730
847 +++ linux-patched/cluster/dlm/device.c  2004-06-25 18:31:07.000000000 +0800
848 @@ -0,0 +1,1020 @@
849 +/******************************************************************************
850 +*******************************************************************************
851 +**
852 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
853 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
854 +**
855 +**  This copyrighted material is made available to anyone wishing to use,
856 +**  modify, copy, or redistribute it subject to the terms and conditions
857 +**  of the GNU General Public License v.2.
858 +**
859 +*******************************************************************************
860 +******************************************************************************/
861 +
862 +/*
863 + * device.c
864 + *
865 + * This is the userland interface to the DLM.
866 + *
867 + * The locking is done via a misc char device (find the
868 + * registered minor number in /proc/misc).
869 + *
870 + * User code should not use this interface directly but
871 + * call the library routines in libdlm.a instead.
872 + *
873 + */
874 +
875 +#include <linux/miscdevice.h>
876 +#include <linux/init.h>
877 +#include <linux/wait.h>
878 +#include <linux/module.h>
879 +#include <linux/file.h>
880 +#include <linux/fs.h>
881 +#include <linux/poll.h>
882 +#include <linux/signal.h>
883 +#include <linux/spinlock.h>
884 +#include <asm/ioctls.h>
885 +
886 +#include "dlm_internal.h"
887 +#include "device.h"
888 +
889 +extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int);
890 +static struct file_operations _dlm_fops;
891 +static const char *name_prefix="dlm";
892 +static struct list_head user_ls_list;
893 +
894 +/* Flags in li_flags */
895 +#define LI_FLAG_COMPLETE  1
896 +#define LI_FLAG_FIRSTLOCK 2
897 +
898 +struct lock_info {
899 +       uint8_t li_cmd;
900 +       struct dlm_lksb li_lksb;
901 +       wait_queue_head_t li_waitq;
902 +       unsigned long li_flags;
903 +       void __user *li_astparam;
904 +       void __user *li_astaddr;
905 +       void __user *li_bastaddr;
906 +       struct file_info *li_file;
907 +       struct dlm_lksb __user *li_user_lksb;
908 +       struct semaphore li_firstlock;
909 +       struct dlm_queryinfo *li_queryinfo;
910 +       struct dlm_queryinfo __user *li_user_queryinfo;
911 +};
912 +
913 +/* A queued AST no less */
914 +struct ast_info {
915 +       struct dlm_lock_result result;
916 +       struct dlm_queryinfo *queryinfo;
917 +       struct dlm_queryinfo __user *user_queryinfo;
918 +       struct list_head list;
919 +};
920 +
921 +/* One of these per userland lockspace */
922 +struct user_ls {
923 +       void    *ls_lockspace;
924 +       atomic_t ls_refcnt;
925 +       long     ls_flags; /* bit 1 means LS has been deleted */
926 +
927 +       /* Passed into misc_register() */
928 +       struct miscdevice ls_miscinfo;
929 +       struct list_head  ls_list;
930 +};
931 +
932 +/* misc_device info for the control device */
933 +static struct miscdevice ctl_device;
934 +
935 +/*
936 + * Stuff we hang off the file struct.
937 + * The first two are to cope with unlocking all the
938 + * locks held by a process when it dies.
939 + */
940 +struct file_info {
941 +       struct list_head    fi_lkb_list;     /* List of active lkbs */
942 +       spinlock_t          fi_lkb_lock;
943 +       struct list_head    fi_ast_list;     /* Queue of ASTs to be delivered */
944 +       spinlock_t          fi_ast_lock;
945 +       wait_queue_head_t   fi_wait;
946 +       struct user_ls     *fi_ls;
947 +       atomic_t            fi_refcnt;       /* Number of users */
948 +       unsigned long       fi_flags;        /* Bit 1 means the device is open */
949 +};
950 +
951 +
952 +/* get and put ops for file_info.
953 +   Actually I don't really like "get" and "put", but everyone
954 +   else seems to use them and I can't think of anything
955 +   nicer at the moment */
956 +static void get_file_info(struct file_info *f)
957 +{
958 +       atomic_inc(&f->fi_refcnt);
959 +}
960 +
961 +static void put_file_info(struct file_info *f)
962 +{
963 +       if (atomic_dec_and_test(&f->fi_refcnt))
964 +               kfree(f);
965 +}
966 +
967 +/* Find a lockspace struct given the device minor number */
968 +static struct user_ls *find_lockspace(int minor)
969 +{
970 +       struct user_ls *lsinfo;
971 +
972 +       list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
973 +
974 +               if (lsinfo->ls_miscinfo.minor == minor)
975 +                       return lsinfo;
976 +       }
977 +       return NULL;
978 +}
979 +
980 +static void add_lockspace_to_list(struct user_ls *lsinfo)
981 +{
982 +       list_add(&lsinfo->ls_list, &user_ls_list);
983 +}
984 +
985 +/* Register a lockspace with the DLM and create a misc
986 +   device for userland to access it */
987 +static int register_lockspace(char *name, struct user_ls **ls)
988 +{
989 +       struct user_ls *newls;
990 +       int status;
991 +       int namelen;
992 +
993 +       namelen = strlen(name)+strlen(name_prefix)+2;
994 +
995 +       newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
996 +       if (!newls)
997 +               return -ENOMEM;
998 +       memset(newls, 0, sizeof(struct user_ls));
999 +
1000 +       newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
1001 +       if (!newls->ls_miscinfo.name) {
1002 +               kfree(newls);
1003 +               return -ENOMEM;
1004 +       }
1005 +       snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
1006 +
1007 +       status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
1008 +                                   strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
1009 +                                   &newls->ls_lockspace, 0);
1010 +
1011 +       if (status != 0) {
1012 +               kfree(newls->ls_miscinfo.name);
1013 +               kfree(newls);
1014 +               return status;
1015 +       }
1016 +
1017 +       newls->ls_miscinfo.fops = &_dlm_fops;
1018 +       newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1019 +
1020 +       status = misc_register(&newls->ls_miscinfo);
1021 +       if (status) {
1022 +               log_print("failed to register misc device for %s", name);
1023 +               dlm_release_lockspace(newls->ls_lockspace, 0);
1024 +               kfree(newls->ls_miscinfo.name);
1025 +               kfree(newls);
1026 +               return status;
1027 +       }
1028 +
1029 +
1030 +       add_lockspace_to_list(newls);
1031 +       *ls = newls;
1032 +       return 0;
1033 +}
1034 +
1035 +static int unregister_lockspace(struct user_ls *lsinfo, int force)
1036 +{
1037 +       int status;
1038 +
1039 +       status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1040 +       if (status)
1041 +               return status;
1042 +
1043 +       status = misc_deregister(&lsinfo->ls_miscinfo);
1044 +       if (status)
1045 +               return status;
1046 +
1047 +       list_del(&lsinfo->ls_list);
1048 +       kfree(lsinfo->ls_miscinfo.name);
1049 +       kfree(lsinfo);
1050 +
1051 +       return 0;
1052 +}
1053 +
1054 +/* Add it to userland's AST queue */
1055 +static void add_to_astqueue(struct lock_info *li, void *astaddr)
1056 +{
1057 +       struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1058 +       if (!ast)
1059 +               return;
1060 +
1061 +       ast->result.astparam  = li->li_astparam;
1062 +       ast->result.astaddr   = astaddr;
1063 +       ast->result.user_lksb = li->li_user_lksb;
1064 +       ast->result.cmd       = li->li_cmd;
1065 +       memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1066 +
1067 +       /* These two will both be NULL for anything other than queries */
1068 +       ast->queryinfo        = li->li_queryinfo;
1069 +       ast->user_queryinfo   = li->li_user_queryinfo;
1070 +
1071 +       spin_lock(&li->li_file->fi_ast_lock);
1072 +       list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1073 +       spin_unlock(&li->li_file->fi_ast_lock);
1074 +       wake_up_interruptible(&li->li_file->fi_wait);
1075 +}
1076 +
1077 +static void bast_routine(void *param, int mode)
1078 +{
1079 +       struct lock_info *li = param;
1080 +
1081 +       if (param) {
1082 +               add_to_astqueue(li, li->li_bastaddr);
1083 +       }
1084 +}
1085 +
1086 +/*
1087 + * This is the kernel's AST routine.
1088 + * All lock, unlock & query operations complete here.
1089 + * The only synchronous ops are those done during device close.
1090 + */
1091 +static void ast_routine(void *param)
1092 +{
1093 +       struct lock_info *li = param;
1094 +
1095 +       /* Param may be NULL if a persistent lock is unlocked by someone else */
1096 +       if (!param)
1097 +               return;
1098 +
1099 +       /* If it's an async request then post data to the user's AST queue. */
1100 +       if (li->li_astaddr) {
1101 +
1102 +               /* Only queue AST if the device is still open */
1103 +               if (test_bit(1, &li->li_file->fi_flags))
1104 +                       add_to_astqueue(li, li->li_astaddr);
1105 +
1106 +               /* If it's a new lock operation that failed, then
1107 +                * remove it from the owner queue and free the
1108 +                * lock_info. The DLM will not free the LKB until this
1109 +                * AST has completed.
1110 +                */
1111 +               if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1112 +                   li->li_lksb.sb_status != 0) {
1113 +                       gd_lkb_t *lkb;
1114 +
1115 +                       /* Wait till dlm_lock() has finished */
1116 +                       down(&li->li_firstlock);
1117 +                       lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1118 +                       if (lkb) {
1119 +                               spin_lock(&li->li_file->fi_lkb_lock);
1120 +                               list_del(&lkb->lkb_ownerqueue);
1121 +                               spin_unlock(&li->li_file->fi_lkb_lock);
1122 +                       }
1123 +                       up(&li->li_firstlock);
1124 +                       put_file_info(li->li_file);
1125 +                       kfree(li);
1126 +                       return;
1127 +               }
1128 +               /* Free unlocks & queries */
1129 +               if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1130 +                   li->li_cmd == DLM_USER_QUERY) {
1131 +                       put_file_info(li->li_file);
1132 +                       kfree(li);
1133 +               }
1134 +       }
1135 +       else {
1136 +               /* Synchronous request, just wake up the caller */
1137 +               set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1138 +               wake_up_interruptible(&li->li_waitq);
1139 +       }
1140 +}
1141 +
1142 +/*
1143 + * Wait for the lock op to complete and return the status.
1144 + */
1145 +static int wait_for_ast(struct lock_info *li)
1146 +{
1147 +       /* Wait for the AST routine to complete */
1148 +       set_task_state(current, TASK_INTERRUPTIBLE);
1149 +       while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1150 +               schedule();
1151 +
1152 +       set_task_state(current, TASK_RUNNING);
1153 +
1154 +       return li->li_lksb.sb_status;
1155 +}
1156 +
1157 +
1158 +/* Open on control device */
1159 +static int dlm_ctl_open(struct inode *inode, struct file *file)
1160 +{
1161 +       return 0;
1162 +}
1163 +
1164 +/* Close on control device */
1165 +static int dlm_ctl_close(struct inode *inode, struct file *file)
1166 +{
1167 +       return 0;
1168 +}
1169 +
1170 +/* Open on lockspace device */
1171 +static int dlm_open(struct inode *inode, struct file *file)
1172 +{
1173 +       struct file_info *f;
1174 +       struct user_ls *lsinfo;
1175 +
1176 +       lsinfo = find_lockspace(iminor(inode));
1177 +       if (!lsinfo)
1178 +               return -ENOENT;
1179 +
1180 +       f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1181 +       if (!f)
1182 +               return -ENOMEM;
1183 +
1184 +       atomic_inc(&lsinfo->ls_refcnt);
1185 +       INIT_LIST_HEAD(&f->fi_lkb_list);
1186 +       INIT_LIST_HEAD(&f->fi_ast_list);
1187 +       spin_lock_init(&f->fi_ast_lock);
1188 +       spin_lock_init(&f->fi_lkb_lock);
1189 +       init_waitqueue_head(&f->fi_wait);
1190 +       f->fi_ls = lsinfo;
1191 +       atomic_set(&f->fi_refcnt, 1);
1192 +       set_bit(1, &f->fi_flags);
1193 +
1194 +       file->private_data = f;
1195 +
1196 +       return 0;
1197 +}
1198 +
1199 +/* Check the user's version matches ours */
1200 +static int check_version(struct dlm_lock_params *params)
1201 +{
1202 +       if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1203 +           (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1204 +            params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1205 +
1206 +               log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1207 +                      params->version[0],
1208 +                      params->version[1],
1209 +                      params->version[2],
1210 +                      DLM_DEVICE_VERSION_MAJOR,
1211 +                      DLM_DEVICE_VERSION_MINOR,
1212 +                      DLM_DEVICE_VERSION_PATCH);
1213 +               return -EINVAL;
1214 +       }
1215 +       return 0;
1216 +}
1217 +
1218 +/* Close on lockspace device */
1219 +static int dlm_close(struct inode *inode, struct file *file)
1220 +{
1221 +       struct file_info *f = file->private_data;
1222 +       struct lock_info li;
1223 +       sigset_t tmpsig;
1224 +       sigset_t allsigs;
1225 +       gd_lkb_t *lkb, *safe;
1226 +       struct user_ls *lsinfo;
1227 +       DECLARE_WAITQUEUE(wq, current);
1228 +
1229 +       lsinfo = find_lockspace(iminor(inode));
1230 +       if (!lsinfo)
1231 +               return -ENOENT;
1232 +
1233 +       /* Mark this closed so that ASTs will not be delivered any more */
1234 +       clear_bit(1, &f->fi_flags);
1235 +
1236 +       /* Block signals while we are doing this */
1237 +       sigfillset(&allsigs);
1238 +       sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1239 +
1240 +       /* We use our own lock_info struct here, so that any
1241 +        * outstanding "real" ASTs will be delivered with the
1242 +        * corresponding "real" params, thus freeing the lock_info
1243 +        * that belongs to the lock. This catches the corner case where
1244 +        * a lock is BUSY when we try to unlock it here
1245 +        */
1246 +       memset(&li, 0, sizeof(li));
1247 +       clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1248 +       init_waitqueue_head(&li.li_waitq);
1249 +       add_wait_queue(&li.li_waitq, &wq);
1250 +
1251 +       /*
1252 +        * Free any outstanding locks; they are on the
1253 +        * list in LIFO order, so there should be no problem
1254 +        * with unlocking parents before children.
1255 +        * Although we don't remove the lkbs from the list here
1256 +        * (what would be the point?), foreach_safe is needed
1257 +        * because the lkbs are freed during dlm_unlock operations
1258 +        */
1259 +       list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
1260 +               int status;
1261 +               int lock_status;
1262 +               int flags = 0;
1263 +               struct lock_info *old_li;
1264 +
1265 +               /* Make a copy of this pointer. If all goes well we will
1266 +                * free it later; if not, it will be left to the AST routine
1267 +                * to tidy up.
1268 +                */
1269 +               old_li = (struct lock_info *)lkb->lkb_astparam;
1270 +
1271 +               /* Don't unlock persistent locks */
1272 +               if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1273 +                       list_del(&lkb->lkb_ownerqueue);
1274 +
1275 +                       /* But tidy our references in it */
1276 +                       kfree(old_li);
1277 +                       lkb->lkb_astparam = (long)NULL;
1278 +                       put_file_info(f);
1279 +                       continue;
1280 +               }
1281 +
1282 +               clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1283 +
1284 +               /* If it's not granted then cancel the request.
1285 +                * If the lock was WAITING then it will be dropped,
1286 +                *    if it was converting then it will be reverted to GRANTED,
1287 +                *    then we will unlock it.
1288 +                */
1289 +               lock_status = lkb->lkb_status;
1290 +
1291 +               if (lock_status != GDLM_LKSTS_GRANTED)
1292 +                       flags = DLM_LKF_CANCEL;
1293 +
1294 +               status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1295 +
1296 +               /* Must wait for it to complete as the next lock could be its
1297 +                * parent */
1298 +               if (status == 0)
1299 +                       wait_for_ast(&li);
1300 +
1301 +               /* If it was waiting for a conversion, it will
1302 +                  now be granted so we can unlock it properly */
1303 +               if (lock_status == GDLM_LKSTS_CONVERT) {
1304 +
1305 +                       clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1306 +                       status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
1307 +
1308 +                       if (status == 0)
1309 +                               wait_for_ast(&li);
1310 +               }
1311 +               /* Unlock succeeded, free the lock_info struct. */
1312 +               if (status == 0) {
1313 +                       kfree(old_li);
1314 +                       put_file_info(f);
1315 +               }
1316 +       }
1317 +
1318 +       remove_wait_queue(&li.li_waitq, &wq);
1319 +
1320 +       /* If this is the last reference, and the lockspace has been deleted
1321 +          then free the struct */
1322 +       if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1323 +               kfree(lsinfo);
1324 +       }
1325 +
1326 +       /* Restore signals */
1327 +       sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1328 +       recalc_sigpending();
1329 +
1330 +       return 0;
1331 +}
1332 +
1333 +/*
1334 + * ioctls on a lockspace device: check how many
1335 + * outstanding ASTs there are against a particular LS.
1336 + */
1337 +static int dlm_ioctl(struct inode *inode, struct file *file,
1338 +                    uint command, ulong u)
1339 +{
1340 +       struct file_info *fi = file->private_data;
1341 +       int status = -EINVAL;
1342 +       int count;
1343 +       struct list_head *tmp_list;
1344 +
1345 +       switch (command) {
1346 +
1347 +               /* Are there any ASTs for us to read?
1348 +                * Warning, this returns the number of messages (ASTs)
1349 +                * in the queue, NOT the number of bytes to read
1350 +                */
1351 +       case FIONREAD:
1352 +               count = 0;
1353 +               spin_lock(&fi->fi_ast_lock);
1354 +               list_for_each(tmp_list, &fi->fi_ast_list)
1355 +                       count++;
1356 +               spin_unlock(&fi->fi_ast_lock);
1357 +               status = put_user(count, (int *)u);
1358 +               break;
1359 +
1360 +       default:
1361 +               return -ENOTTY;
1362 +       }
1363 +
1364 +       return status;
1365 +}
1366 +
1367 +/*
1368 + * ioctls to create/remove lockspaces.
1369 + */
1370 +static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1371 +                        uint command, ulong u)
1372 +{
1373 +       int status = -EINVAL;
1374 +       char ls_name[MAX_LS_NAME_LEN];
1375 +       struct user_ls *lsinfo;
1376 +       int force = 0;
1377 +
1378 +       switch (command) {
1379 +       case DLM_CREATE_LOCKSPACE:
1380 +               if (!capable(CAP_SYS_ADMIN))
1381 +                       return -EPERM;
1382 +
1383 +               if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1384 +                       return -EFAULT;
1385 +               status = register_lockspace(ls_name, &lsinfo);
1386 +
1387 +               /* If it succeeded then return the minor number */
1388 +               if (status == 0)
1389 +                       status = lsinfo->ls_miscinfo.minor;
1390 +               break;
1391 +
1392 +       case DLM_FORCE_RELEASE_LOCKSPACE:
1393 +               force = 2;      /* fall through to release */
1394 +
1395 +       case DLM_RELEASE_LOCKSPACE:
1396 +               if (!capable(CAP_SYS_ADMIN))
1397 +                       return -EPERM;
1398 +
1399 +               lsinfo = find_lockspace(u);
1400 +               if (!lsinfo)
1401 +                       return -EINVAL;
1402 +               status = unregister_lockspace(lsinfo, force);
1403 +               break;
1404 +
1405 +       default:
1406 +               return -ENOTTY;
1407 +       }
1408 +
1409 +       return status;
1410 +}
1411 +
1412 +/* Deal with the messy stuff of copying a web of structs
1413 +   from kernel space to userspace */
1414 +static int copy_query_result(struct ast_info *ast)
1415 +{
1416 +       int status = -EFAULT;
1417 +       struct dlm_queryinfo qi;
1418 +
1419 +       /* Get the pointers to userspace structs */
1420 +       if (copy_from_user(&qi, ast->user_queryinfo,
1421 +                          sizeof(struct dlm_queryinfo)))
1422 +               goto copy_out;
1423 +
1424 +       /* TODO: does this deref a user pointer? */
1425 +       if (put_user(ast->queryinfo->gqi_lockcount,
1426 +                    &ast->user_queryinfo->gqi_lockcount))
1427 +               goto copy_out;
1428 +
1429 +       if (qi.gqi_resinfo) {
1430 +               if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1431 +                                sizeof(struct dlm_resinfo)))
1432 +                       goto copy_out;
1433 +       }
1434 +
1435 +       if (qi.gqi_lockinfo) {
1436 +               if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1437 +                                sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1438 +                       goto copy_out;
1439 +       }
1440 +
1441 +       status = 0;
1442 +
1443 +       if (ast->queryinfo->gqi_lockinfo)
1444 +               kfree(ast->queryinfo->gqi_lockinfo);
1445 +
1446 +       if (ast->queryinfo->gqi_resinfo)
1447 +               kfree(ast->queryinfo->gqi_resinfo);
1448 +
1449 +       kfree(ast->queryinfo);
1450 +
1451 + copy_out:
1452 +       return status;
1453 +}
1454 +
1455 +/* Read call, might block if no ASTs are waiting.
1456 + * It will only ever return one message at a time, regardless
1457 + * of how many are pending.
1458 + */
1459 +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1460 +{
1461 +       struct file_info *fi = file->private_data;
1462 +       struct ast_info *ast;
1463 +       int ret;
1464 +       DECLARE_WAITQUEUE(wait, current);
1465 +
1466 +       if (count < sizeof(struct dlm_lock_result))
1467 +               return -EINVAL;
1468 +
1469 +       spin_lock(&fi->fi_ast_lock);
1470 +       if (list_empty(&fi->fi_ast_list)) {
1471 +               /* No waiting ASTs.  Return EOF (dropping the lock
1472 +                * first) if the lockspace has been deleted. */
1473 +               if (test_bit(1, &fi->fi_ls->ls_flags)) {
1474 +                       spin_unlock(&fi->fi_ast_lock);
1475 +                       return 0;
1476 +               }
1477 +
1478 +               if (file->f_flags & O_NONBLOCK) {
1479 +                       spin_unlock(&fi->fi_ast_lock);
1480 +                       return -EAGAIN;
1481 +               }
1482 +
1483 +               add_wait_queue(&fi->fi_wait, &wait);
1484 +
1485 +       repeat:
1486 +               set_current_state(TASK_INTERRUPTIBLE);
1487 +               if (list_empty(&fi->fi_ast_list) &&
1488 +                   !signal_pending(current)) {
1489 +
1490 +                       spin_unlock(&fi->fi_ast_lock);
1491 +                       schedule();
1492 +                       spin_lock(&fi->fi_ast_lock);
1493 +                       goto repeat;
1494 +               }
1495 +
1496 +               current->state = TASK_RUNNING;
1497 +               remove_wait_queue(&fi->fi_wait, &wait);
1498 +
1499 +               if (signal_pending(current)) {
1500 +                       spin_unlock(&fi->fi_ast_lock);
1501 +                       return -ERESTARTSYS;
1502 +               }
1503 +       }
1504 +
1505 +       ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1506 +       list_del(&ast->list);
1507 +       spin_unlock(&fi->fi_ast_lock);
1508 +
1509 +       ret = sizeof(struct dlm_lock_result);
1510 +       if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1511 +               ret = -EFAULT;
1512 +
1513 +       /* If it was a query then copy the result block back here */
1514 +       if (ast->queryinfo) {
1515 +               int status = copy_query_result(ast);
1516 +               if (status)
1517 +                       ret = status;
1518 +       }
1519 +
1520 +       kfree(ast);
1521 +       return ret;
1522 +}
1523 +
1524 +static unsigned int dlm_poll(struct file *file, poll_table *wait)
1525 +{
1526 +       struct file_info *fi = file->private_data;
1527 +
1528 +       poll_wait(file, &fi->fi_wait, wait);
1529 +
1530 +       spin_lock(&fi->fi_ast_lock);
1531 +       if (!list_empty(&fi->fi_ast_list)) {
1532 +               spin_unlock(&fi->fi_ast_lock);
1533 +               return POLLIN | POLLRDNORM;
1534 +       }
1535 +
1536 +       spin_unlock(&fi->fi_ast_lock);
1537 +       return 0;
1538 +}
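+
+/* Userland can therefore poll() the lockspace device and read() one AST
+ * message per POLLIN indication, e.g. (sketch, hypothetical setup):
+ *
+ *     struct pollfd pfd = { .fd = ls_fd, .events = POLLIN };
+ *
+ *     if (poll(&pfd, 1, -1) > 0)
+ *             read(ls_fd, &result, sizeof(result));
+ */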
1539 +
1540 +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1541 +{
1542 +       struct lock_info *li;
1543 +       int status;
1544 +
1545 +       li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1546 +       if (!li)
1547 +               return -ENOMEM;
1548 +
1550 +       li->li_user_lksb = kparams->lksb;
1551 +       li->li_astparam  = kparams->astparam;
1552 +       li->li_bastaddr  = kparams->bastaddr;
1553 +       li->li_astaddr   = kparams->astaddr;
1554 +       li->li_file      = fi;
1555 +       li->li_flags     = 0;
1556 +       li->li_cmd       = kparams->cmd;
1557 +       clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1558 +
1559 +       if (copy_from_user(&li->li_lksb, kparams->lksb,
1560 +                          sizeof(struct dlm_lksb))) {
1561 +               kfree(li);
1562 +               return -EFAULT;
1563 +       }
1564 +       li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1565 +
1566 +       /* Allocate query structs */
1567 +       status = -ENOMEM;
1568 +       li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1569 +       if (!li->li_queryinfo)
1570 +               goto out1;
1571 +
1572 +       /* Mainly to get gqi_lock buffer size */
1573 +       if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1574 +                          sizeof(struct dlm_queryinfo))) {
1575 +               status = -EFAULT;
1576 +               goto out2;
1577 +       }
1578 +
1579 +       /* Overwrite userspace pointers we just copied with kernel space ones */
1580 +       if (li->li_queryinfo->gqi_resinfo) {
1581 +               li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1582 +               if (!li->li_queryinfo->gqi_resinfo)
1583 +                       goto out2;
1584 +       }
1585 +       if (li->li_queryinfo->gqi_lockinfo) {
1586 +               li->li_queryinfo->gqi_lockinfo =
1587 +                       kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1588 +                               GFP_KERNEL);
1589 +               if (!li->li_queryinfo->gqi_lockinfo)
1590 +                       goto out3;
1591 +       }
1592 +
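+       /* From here on sb_lvbptr refers to the kernel-side queryinfo;
+        * copy_query_result() maps the results back to the userspace
+        * structs when the completion AST is read */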
1593 +       li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1594 +
+       /* Take the file reference only after everything that can fail
+        * has succeeded, so the error paths below don't leak it */
+       get_file_info(fi);
+
1595 +       return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1596 +                         kparams->flags, /* query */
1597 +                         li->li_queryinfo,
1598 +                         ast_routine, li);
1599 +
+ out3:
+       /* NULL if no per-resource info was requested; kfree(NULL) is a
+        * no-op */
+       kfree(li->li_queryinfo->gqi_resinfo);
+
1600 + out2:
1601 +       kfree(li->li_queryinfo);
1602 +
1603 + out1:
1604 +       kfree(li);
1605 +       return status;
1606 +}
1607 +
1608 +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1609 +                       const char __user *buffer)
1610 +{
1611 +       struct lock_info *li;
1612 +       int status;
1613 +       char name[DLM_RESNAME_MAXLEN];
1614 +
1615 +       /*
1616 +        * Validate things that we need to have correct.
1617 +        */
1618 +       if (kparams->namelen > DLM_RESNAME_MAXLEN)
1619 +               return -EINVAL;
1620 +
1621 +       if (!kparams->astaddr)
1622 +               return -EINVAL;
1623 +
1624 +       if (!kparams->lksb)
1625 +               return -EINVAL;
1626 +
1627 +       /* Get the lock name */
1628 +       if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1629 +                          kparams->namelen)) {
1630 +               return -EFAULT;
1631 +       }
1632 +
1633 +       /* For conversions, the lock will already have a lock_info
1634 +          block squirrelled away in astparam */
1635 +       if (kparams->flags & DLM_LKF_CONVERT) {
1636 +               gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1637 +               if (!lkb) {
1638 +                       return -EINVAL;
1639 +               }
1640 +               li = (struct lock_info *)lkb->lkb_astparam;
1641 +
1642 +               /* Only override these if they are provided */
1643 +               if (li->li_user_lksb)
1644 +                       li->li_user_lksb = kparams->lksb;
1645 +               if (li->li_astparam)
1646 +                       li->li_astparam  = kparams->astparam;
1647 +               if (li->li_bastaddr)
1648 +                       li->li_bastaddr  = kparams->bastaddr;
1649 +               if (li->li_astaddr)
1650 +                       li->li_astaddr   = kparams->astaddr;
1651 +               li->li_flags     = 0;
1652 +       }
1653 +       else {
1654 +               li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1655 +               if (!li)
1656 +                       return -ENOMEM;
1657 +
1658 +               li->li_user_lksb = kparams->lksb;
1659 +               li->li_astparam  = kparams->astparam;
1660 +               li->li_bastaddr  = kparams->bastaddr;
1661 +               li->li_astaddr   = kparams->astaddr;
1662 +               li->li_file      = fi;
1663 +               li->li_flags     = 0;
1664 +               li->li_cmd       = kparams->cmd;
1665 +               li->li_queryinfo  = NULL;
1666 +
1667 +               /* semaphore to allow us to complete our work before
1668 +                  the AST routine runs. In fact we only need (and use) this
1669 +                  when the initial lock fails */
1670 +               init_MUTEX_LOCKED(&li->li_firstlock);
1671 +               set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1672 +
1673 +               get_file_info(fi);
1674 +       }
1675 +
1676 +       /* Copy the user's LKSB into kernel space,
1677 +          needed for conversions & value block operations */
1678 +       if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
1679 +                                           sizeof(struct dlm_lksb))) {
+               /* Don't leak the lock_info allocated above for a new lock;
+                * a conversion reuses one owned by the existing lock */
+               if (!(kparams->flags & DLM_LKF_CONVERT))
+                       kfree(li);
1680 +               return -EFAULT;
+       }
1681 +
1682 +       /* Lock it ... */
1683 +       status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1684 +                          kparams->flags, name, kparams->namelen,
1685 +                          kparams->parent,
1686 +                          ast_routine,
1687 +                          li,
1688 +                          li->li_bastaddr ? bast_routine : NULL,
1689 +                          kparams->range.ra_end ? &kparams->range : NULL);
1690 +
1691 +       /* If it succeeded (this far) with a new lock then keep track of
1692 +          it on the file's lkb list */
1693 +       if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1694 +               gd_lkb_t *lkb;
1695 +               lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1696 +
1697 +               if (lkb) {
1698 +                       spin_lock(&fi->fi_lkb_lock);
1699 +                       list_add(&lkb->lkb_ownerqueue,
1700 +                                &fi->fi_lkb_list);
1701 +                       spin_unlock(&fi->fi_lkb_lock);
1702 +               }
1703 +               else {
1704 +                       log_print("failed to get lkb for new lock");
1705 +               }
1706 +               up(&li->li_firstlock);
1707 +       }
1708 +
1709 +       return status;
1710 +}
1711 +
1712 +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1713 +{
1714 +       struct lock_info *li;
1715 +       gd_lkb_t *lkb;
1716 +       int status;
1717 +
1718 +       lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1719 +       if (!lkb) {
1720 +               return -EINVAL;
1721 +       }
1722 +
1723 +       li = (struct lock_info *)lkb->lkb_astparam;
1724 +
1725 +       li->li_user_lksb = kparams->lksb;
1726 +       li->li_astparam  = kparams->astparam;
1727 +       li->li_cmd       = kparams->cmd;
1728 +
1729 +       /* Have to do it here because the lkb may not exist after
1730 +        * dlm_unlock() */
1731 +       spin_lock(&fi->fi_lkb_lock);
1732 +       list_del(&lkb->lkb_ownerqueue);
1733 +       spin_unlock(&fi->fi_lkb_lock);
1734 +
1735 +       /* Use existing lksb & astparams */
1736 +       status = dlm_unlock(fi->fi_ls->ls_lockspace,
1737 +                            kparams->lkid,
1738 +                            kparams->flags, NULL, NULL);
1739 +
1740 +       return status;
1741 +}
1742 +
1743 +/* Write call, submit a locking request */
1744 +static ssize_t dlm_write(struct file *file, const char __user *buffer,
1745 +                        size_t count, loff_t *ppos)
1746 +{
1747 +       struct file_info *fi = file->private_data;
1748 +       struct dlm_lock_params kparams;
1749 +       sigset_t tmpsig;
1750 +       sigset_t allsigs;
1751 +       int status;
1752 +
1753 +       if (count < sizeof(kparams))
1754 +               return -EINVAL;
1755 +
1756 +       /* Has the lockspace been deleted? */
1757 +       if (test_bit(1, &fi->fi_ls->ls_flags))
1758 +               return -ENOENT;
1759 +
1760 +       /* Get the command info */
1761 +       if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1762 +               return -EFAULT;
1763 +
1764 +       if (check_version(&kparams))
1765 +               return -EINVAL;
1766 +
1767 +       /* Block signals while we are doing this */
1768 +       sigfillset(&allsigs);
1769 +       sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1770 +
1771 +       switch (kparams.cmd) {
1773 +       case DLM_USER_LOCK:
1774 +               status = do_user_lock(fi, &kparams, buffer);
1775 +               break;
1776 +
1777 +       case DLM_USER_UNLOCK:
1778 +               status = do_user_unlock(fi, &kparams);
1779 +               break;
1780 +
1781 +       case DLM_USER_QUERY:
1782 +               status = do_user_query(fi, &kparams);
1783 +               break;
1784 +
1785 +       default:
1786 +               status = -EINVAL;
1787 +               break;
1788 +       }
1789 +       /* Restore signals */
1790 +       sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1791 +       recalc_sigpending();
1792 +
1793 +       if (status == 0)
1794 +               return count;
1795 +       else
1796 +               return status;
1797 +}
1798 +
1799 +void dlm_device_free_devices(void)
1800 +{
1801 +       struct user_ls *tmp;
1802 +       struct user_ls *lsinfo;
1803 +
1804 +       list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
1805 +               misc_deregister(&lsinfo->ls_miscinfo);
1806 +
1807 +               /* Tidy up, but don't delete the lsinfo struct until
1808 +                  all the users have closed their devices */
1809 +               list_del(&lsinfo->ls_list);
1810 +               kfree(lsinfo->ls_miscinfo.name);
1811 +               set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1812 +       }
1813 +}
1814 +
1815 +static struct file_operations _dlm_fops = {
1816 +      .open    = dlm_open,
1817 +      .release = dlm_close,
1818 +      .ioctl   = dlm_ioctl,
1819 +      .read    = dlm_read,
1820 +      .write   = dlm_write,
1821 +      .poll    = dlm_poll,
1822 +      .owner   = THIS_MODULE,
1823 +};
1824 +
1825 +static struct file_operations _dlm_ctl_fops = {
1826 +      .open    = dlm_ctl_open,
1827 +      .release = dlm_ctl_close,
1828 +      .ioctl   = dlm_ctl_ioctl,
1829 +      .owner   = THIS_MODULE,
1830 +};
1831 +
1832 +/*
1833 + * Create control device
1834 + */
1835 +int dlm_device_init(void)
1836 +{
1837 +       int r;
1838 +
1839 +       INIT_LIST_HEAD(&user_ls_list);
1840 +
1841 +       ctl_device.name = "dlm-control";
1842 +       ctl_device.fops = &_dlm_ctl_fops;
1843 +       ctl_device.minor = MISC_DYNAMIC_MINOR;
1844 +
1845 +       r = misc_register(&ctl_device);
1846 +       if (r) {
1847 +               log_print("misc_register failed for DLM control device");
1848 +               return r;
1849 +       }
1850 +
1851 +       return 0;
1852 +}
1853 +
1854 +void dlm_device_exit(void)
1855 +{
1856 +       misc_deregister(&ctl_device);
1857 +}
1858 +
1859 +/*
1860 + * Overrides for Emacs so that we follow Linus's tabbing style.
1861 + * Emacs will notice this stuff at the end of the file and automatically
1862 + * adjust the settings for this buffer only.  This must remain at the end
1863 + * of the file.
1864 + * ---------------------------------------------------------------------------
1865 + * Local variables:
1866 + * c-file-style: "linux"
1867 + * End:
1868 + */
1869 diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
1870 --- linux-orig/cluster/dlm/device.h     1970-01-01 07:30:00.000000000 +0730
1871 +++ linux-patched/cluster/dlm/device.h  2004-06-25 18:31:07.000000000 +0800
1872 @@ -0,0 +1,19 @@
1873 +/******************************************************************************
1874 +*******************************************************************************
1875 +**
1876 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
1877 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
1878 +**  
1879 +**  This copyrighted material is made available to anyone wishing to use,
1880 +**  modify, copy, or redistribute it subject to the terms and conditions
1881 +**  of the GNU General Public License v.2.
1882 +**
1883 +*******************************************************************************
1884 +******************************************************************************/
1885 +
1886 +#ifndef __DEVICE_DOT_H__
1887 +#define __DEVICE_DOT_H__
1888 +
1889 +extern void dlm_device_free_devices(void);
1890 +
1891 +#endif                         /* __DEVICE_DOT_H__ */
1892 diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
1893 --- linux-orig/cluster/dlm/dir.c        1970-01-01 07:30:00.000000000 +0730
1894 +++ linux-patched/cluster/dlm/dir.c     2004-06-25 18:31:07.000000000 +0800
1895 @@ -0,0 +1,430 @@
1896 +/******************************************************************************
1897 +*******************************************************************************
1898 +**
1899 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
1900 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
1901 +**  
1902 +**  This copyrighted material is made available to anyone wishing to use,
1903 +**  modify, copy, or redistribute it subject to the terms and conditions
1904 +**  of the GNU General Public License v.2.
1905 +**
1906 +*******************************************************************************
1907 +******************************************************************************/
1908 +
1909 +#include "dlm_internal.h"
1910 +#include "nodes.h"
1911 +#include "lockspace.h"
1912 +#include "lowcomms.h"
1913 +#include "reccomms.h"
1914 +#include "rsb.h"
1915 +#include "config.h"
1916 +#include "memory.h"
1917 +#include "recover.h"
1918 +#include "util.h"
1919 +
1920 +/* 
1921 + * We use the upper 16 bits of the hash value to select the directory node.
1922 + * Low bits are used for distribution of rsb's among hash buckets on each node.
1923 + *
1924 + * From the hash value, we are interested in arriving at a final value between
1925 + * zero and the number of nodes minus one (num_nodes - 1).
1926 + *
1927 + * To accomplish this scaling, we take the nearest power of two larger than
1928 + * num_nodes and subtract one to create a bit mask.  The mask is applied to the
1929 + * hash, reducing the range to nearer the final range.
1930 + *
1931 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
1932 + * num_nodes to the previously masked hash value.
1933 + *
1934 + * This value in the desired range is used as an offset into the sorted list of
1935 + * nodeid's to give the particular nodeid of the directory node.
1936 + */
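+
+/* 
+ * Worked example (hypothetical numbers): with num_nodes = 5 the nearest
+ * power of two above it is 8, so ls_nodes_mask = 7.  A name hashing to
+ * 0x2a31beef gives ((0x2a31beef >> 16) & 7) = (0x2a31 & 7) = 1, and
+ * 1 % 5 = 1, selecting the second nodeid in the sorted list.
+ */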
1937 +
1938 +uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length)
1939 +{
1940 +       struct list_head *tmp;
1941 +       gd_csb_t *csb = NULL;
1942 +       uint32_t hash, node, n = 0, nodeid;
1943 +
1944 +       if (ls->ls_num_nodes == 1) {
1945 +               nodeid = our_nodeid();
1946 +               goto out;
1947 +       }
1948 +
1949 +       hash = gdlm_hash(name, length);
1950 +       node = (hash >> 16) & ls->ls_nodes_mask;
1951 +       node %= ls->ls_num_nodes;
1952 +
1953 +       list_for_each(tmp, &ls->ls_nodes) {
1954 +               if (n++ != node)
1955 +                       continue;
1956 +               csb = list_entry(tmp, gd_csb_t, csb_list);
1957 +               break;
1958 +       }
1959 +
1960 +       GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
1961 +                               ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
1962 +       nodeid = csb->csb_node->gn_nodeid;
1963 +
1964 +      out:
1965 +       return nodeid;
1966 +}
1967 +
1968 +uint32_t get_directory_nodeid(gd_res_t *rsb)
1969 +{
1970 +       return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
1971 +                                       rsb->res_length);
1972 +}
1973 +
1974 +static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len)
1975 +{
1976 +       uint32_t val;
1977 +
1978 +       val = gdlm_hash(name, len);
1979 +       val &= RESDIRHASH_MASK;
1980 +
1981 +       return val;
1982 +}
1983 +
1984 +static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd)
1985 +{
1986 +       gd_resdir_bucket_t *bucket;
1987 +       uint32_t hashval;
1988 +
1989 +       hashval = rd_hash(ls, rd->rd_name, rd->rd_length);
1990 +       bucket = &ls->ls_resdir_hash[hashval];
1991 +
1992 +       list_add_tail(&rd->rd_list, &bucket->rb_reslist);
1993 +}
1994 +
1995 +static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen,
1996 +                                    uint32_t bucket)
1997 +{
1998 +       struct list_head *head;
1999 +       gd_resdata_t *rd;
2000 +
2001 +       head = &ls->ls_resdir_hash[bucket].rb_reslist;
2002 +       list_for_each_entry(rd, head, rd_list) {
2003 +               if (rd->rd_length == namelen &&
2004 +                   !memcmp(name, rd->rd_name, namelen))
2005 +                       goto out;
2006 +       }
2007 +       rd = NULL;
2008 +      out:
2009 +       return rd;
2010 +}
2011 +
2012 +void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
2013 +                   uint8_t sequence)
2014 +{
2015 +       gd_resdata_t *rd;
2016 +       uint32_t bucket;
2017 +
2018 +       bucket = rd_hash(ls, name, namelen);
2019 +
2020 +       write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2021 +
2022 +       rd = search_rdbucket(ls, name, namelen, bucket);
2023 +
2024 +       if (!rd) {
2025 +               log_debug(ls, "remove_resdata not found nodeid=%u", nodeid);
2026 +               goto out;
2027 +       }
2028 +
2029 +       if (rd->rd_master_nodeid != nodeid) {
2030 +               log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid);
2031 +               goto out;
2032 +       }
2033 +
2034 +       if (rd->rd_sequence == sequence) {
2035 +               list_del(&rd->rd_list);
2036 +               free_resdata(rd);
2037 +       } else {
2038 +               /* 
2039 +               log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u",
2040 +                         nodeid, rd->rd_sequence, sequence);
2041 +               */
2042 +       }
2043 +
2044 +      out:
2045 +       write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2046 +}
2047 +
2048 +void resdir_clear(gd_ls_t *ls)
2049 +{
2050 +       struct list_head *head;
2051 +       gd_resdata_t *rd;
2052 +       int i;
2053 +
2054 +       for (i = 0; i < RESDIRHASH_SIZE; i++) {
2055 +               head = &ls->ls_resdir_hash[i].rb_reslist;
2056 +               while (!list_empty(head)) {
2057 +                       rd = list_entry(head->next, gd_resdata_t, rd_list);
2058 +                       list_del(&rd->rd_list);
2059 +                       free_resdata(rd);
2060 +               }
2061 +       }
2062 +}
2063 +
2064 +static void gdlm_resmov_in(gd_resmov_t *rm, char *buf)
2065 +{
2066 +       gd_resmov_t tmp;
2067 +
2068 +       memcpy(&tmp, buf, sizeof(gd_resmov_t));
2069 +
2070 +       rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2071 +       rm->rm_length = be16_to_cpu(tmp.rm_length);
2072 +}
2073 +
2074 +int resdir_rebuild_local(gd_ls_t *ls)
2075 +{
2076 +       gd_csb_t *csb;
2077 +       gd_resdata_t *rd;
2078 +       gd_rcom_t *rc;
2079 +       gd_resmov_t mov, last_mov;
2080 +       char *b, *last_name;
2081 +       int error = -ENOMEM, count = 0;
2082 +
2083 +       log_all(ls, "rebuild resource directory");
2084 +
2085 +       resdir_clear(ls);
2086 +
2087 +       rc = allocate_rcom_buffer(ls);
2088 +       if (!rc)
2089 +               goto out;
2090 +
2091 +       last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2092 +       if (!last_name)
2093 +               goto free_rc;
2094 +
2095 +       list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
2096 +               last_mov.rm_length = 0;
2097 +               for (;;) {
2098 +                       error = gdlm_recovery_stopped(ls);
2099 +                       if (error)
2100 +                               goto free_last;
2101 +
2102 +                       memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2103 +                       rc->rc_datalen = last_mov.rm_length;
2104 +
2105 +                       error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
2106 +                                                 RECCOMM_RECOVERNAMES, rc, 1);
2107 +                       if (error)
2108 +                               goto free_last;
2109 +
2110 +                       schedule();
2111 +
2112 +                       /* 
2113 +                        * pick each res out of buffer
2114 +                        */
2115 +
2116 +                       b = rc->rc_buf;
2117 +
2118 +                       for (;;) {
2119 +                               gdlm_resmov_in(&mov, b);
2120 +                               b += sizeof(gd_resmov_t);
2121 +
2122 +                               /* Length of 0 with a non-zero nodeid marks the 
2123 +                                * end of the list */
2124 +                               if (!mov.rm_length && mov.rm_nodeid)
2125 +                                       goto done;
2126 +
2127 +                               /* This is just the end of the block */
2128 +                               if (!mov.rm_length)
2129 +                                       break;
2130 +
2131 +                               error = -ENOMEM;
2132 +                               rd = allocate_resdata(ls, mov.rm_length);
2133 +                               if (!rd)
2134 +                                       goto free_last;
2135 +
2136 +                               rd->rd_master_nodeid = mov.rm_nodeid;
2137 +                               rd->rd_length = mov.rm_length;
2138 +                               rd->rd_sequence = 1;
2139 +
2140 +                               memcpy(rd->rd_name, b, mov.rm_length);
2141 +                               b += mov.rm_length;
2142 +
2143 +                               add_resdata_to_hash(ls, rd);
2144 +                               count++;
2145 +
2146 +                               last_mov = mov;
2147 +                               memset(last_name, 0, DLM_RESNAME_MAXLEN);
2148 +                               memcpy(last_name, rd->rd_name, rd->rd_length);
2149 +                       }
2150 +               }
2151 +             done:
2152 +               ;
2153 +       }
2154 +
2155 +       set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2156 +       error = 0;
2157 +
2158 +       log_all(ls, "rebuilt %d resources", count);
2159 +
2160 +      free_last:
2161 +       kfree(last_name);
2162 +
2163 +      free_rc:
2164 +       free_rcom_buffer(rc);
2165 +
2166 +      out:
2167 +       return error;
2168 +}
2169 +
2170 +/* 
2171 + * The reply end of resdir_rebuild_local/RECOVERNAMES.  Collect and send as
2172 + * many resource names as can fit in the buffer.
2173 + */
2174 +
2175 +int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf,
2176 +                       int outlen, uint32_t nodeid)
2177 +{
2178 +       struct list_head *list;
2179 +       gd_res_t *start_rsb = NULL, *rsb;
2180 +       int offset = 0, start_namelen, error;
2181 +       char *start_name;
2182 +       gd_resmov_t tmp;
2183 +       uint32_t dir_nodeid;
2184 +
2185 +       /* 
2186 +        * Find the rsb where we left off (or start again)
2187 +        */
2188 +
2189 +       start_namelen = inlen;
2190 +       start_name = inbuf;
2191 +
2192 +       if (start_namelen > 1) {
2193 +               error = find_or_create_rsb(ls, NULL, start_name,
2194 +                                          start_namelen, 0, &start_rsb);
2195 +               GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2196 +               release_rsb(start_rsb);
2197 +       }
2198 +
2199 +       /* 
2200 +        * Send rsb names for rsb's we're master of and whose directory node
2201 +        * matches the requesting node.
2202 +        */
2203 +
2204 +       down_read(&ls->ls_rec_rsblist);
2205 +       if (start_rsb)
2206 +               list = start_rsb->res_rootlist.next;
2207 +       else
2208 +               list = ls->ls_rootres.next;
2209 +
2210 +       for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2211 +               rsb = list_entry(list, gd_res_t, res_rootlist);
2212 +               if (rsb->res_nodeid)
2213 +                       continue;
2214 +
2215 +               dir_nodeid = get_directory_nodeid(rsb);
2216 +               if (dir_nodeid != nodeid)
2217 +                       continue;
2218 +
2219 +               if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) {
2220 +                       /* Write end-of-block record */
2221 +                       memset(&tmp, 0, sizeof(gd_resmov_t));
2222 +                       memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2223 +                       offset += sizeof(gd_resmov_t);
2224 +                       goto out;
2225 +               }
2226 +
2227 +               memset(&tmp, 0, sizeof(gd_resmov_t));
2228 +               tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2229 +               tmp.rm_length = cpu_to_be16(rsb->res_length);
2230 +
2231 +               memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2232 +               offset += sizeof(gd_resmov_t);
2233 +
2234 +               memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2235 +               offset += rsb->res_length;
2236 +       }
2237 +
2238 +       /* 
2239 +        * If we've reached the end of the list (and there's room) write a
2240 +        * terminating record.
2241 +        */
2242 +
2243 +       if ((list == &ls->ls_rootres) &&
2244 +           (offset + sizeof(gd_resmov_t) <= outlen)) {
2245 +
2246 +               memset(&tmp, 0, sizeof(gd_resmov_t));
2247 +               /* This only needs to be non-zero */
2248 +               tmp.rm_nodeid = cpu_to_be32(1);
2249 +               /* and this must be zero */
2250 +               tmp.rm_length = 0;
2251 +               memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2252 +               offset += sizeof(gd_resmov_t);
2253 +       }
2254 +
2255 + out:
2256 +       up_read(&ls->ls_rec_rsblist);
2257 +       return offset;
2258 +}
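+
+/* 
+ * Wire format produced above: a sequence of gd_resmov_t records, each
+ * followed by rm_length bytes of resource name.  rm_length == 0 ends a
+ * block; in that record rm_nodeid == 0 means "ask again for more" while
+ * a non-zero rm_nodeid marks the end of the whole list, matching the
+ * parsing in resdir_rebuild_local().
+ */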
2259 +
2260 +int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
2261 +               gd_resdata_t **rdp, int recovery)
2262 +{
2263 +       gd_resdata_t *rd;
2264 +       gd_resdata_t *tmp;
2265 +       uint32_t bucket;
2266 +
2267 +       bucket = rd_hash(ls, name, namelen);
2268 +
2269 +       read_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2270 +       rd = search_rdbucket(ls, name, namelen, bucket);
2271 +       read_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2272 +
2273 +       if (rd)
2274 +               goto out;
2275 +
2276 +       rd = allocate_resdata(ls, namelen);
2277 +       if (!rd)
2278 +               return -ENOMEM;
2279 +
2280 +       rd->rd_master_nodeid = nodeid;
2281 +       rd->rd_length = namelen;
2282 +       memcpy(rd->rd_name, name, namelen);
2283 +
2284 +       write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2285 +       tmp = search_rdbucket(ls, name, namelen, bucket);
2286 +       if (!tmp)
2287 +               list_add_tail(&rd->rd_list,
2288 +                             &ls->ls_resdir_hash[bucket].rb_reslist);
2289 +       write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2290 +
2291 +       if (tmp) {
2292 +               free_resdata(rd);
2293 +               rd = tmp;
2294 +       }
2295 +
2296 +      out:
2297 +       *rdp = rd;
2298 +
2299 +       if (!recovery) {
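+       /* The sequence number lets remove_resdata() discard stale removal
+        * requests; the increment skips zero so a wrapped counter never
+        * looks uninitialised */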
2300 +               if (++rd->rd_sequence == 0)
2301 +                       rd->rd_sequence++;
2302 +       } else
2303 +               rd->rd_sequence = 1;
2304 +
2305 +       return 0;
2306 +}
2307 +
2308 +/* 
2309 + * The node with lowest id queries all nodes to determine when all are done.
2310 + * All other nodes query the low nodeid for this.
2311 + */
2312 +
2313 +int resdir_rebuild_wait(gd_ls_t *ls)
2314 +{
2315 +       int error;
2316 +
2317 +       if (ls->ls_low_nodeid == our_nodeid()) {
2318 +               error = gdlm_wait_status_all(ls, RESDIR_VALID);
2319 +               if (!error)
2320 +                       set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2321 +       } else
2322 +               error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID);
2323 +
2324 +       return error;
2325 +}
2326 diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2327 --- linux-orig/cluster/dlm/dir.h        1970-01-01 07:30:00.000000000 +0730
2328 +++ linux-patched/cluster/dlm/dir.h     2004-06-25 18:31:07.000000000 +0800
2329 @@ -0,0 +1,30 @@
2330 +/******************************************************************************
2331 +*******************************************************************************
2332 +**
2333 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
2334 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
2335 +**  
2336 +**  This copyrighted material is made available to anyone wishing to use,
2337 +**  modify, copy, or redistribute it subject to the terms and conditions
2338 +**  of the GNU General Public License v.2.
2339 +**
2340 +*******************************************************************************
2341 +******************************************************************************/
2342 +
2343 +#ifndef __DIR_DOT_H__
2344 +#define __DIR_DOT_H__
2345 +
2346 +uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length);
2347 +uint32_t get_directory_nodeid(gd_res_t * rsb);
2348 +void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2349 +                   uint8_t sequence);
2350 +int resdir_rebuild_local(gd_ls_t * ls);
2351 +int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf,
2352 +                       int outlen, uint32_t nodeid);
2353 +int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2354 +               gd_resdata_t ** rdp, int recovery);
2355 +int resdir_rebuild_wait(gd_ls_t * ls);
2356 +void resdir_clear(gd_ls_t * ls);
2357 +void resdir_dump(gd_ls_t * ls);
2358 +
2359 +#endif                         /* __DIR_DOT_H__ */
2360 diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2361 --- linux-orig/cluster/dlm/dlm_internal.h       1970-01-01 07:30:00.000000000 +0730
2362 +++ linux-patched/cluster/dlm/dlm_internal.h    2004-06-25 18:31:07.000000000 +0800
2363 @@ -0,0 +1,634 @@
2364 +/******************************************************************************
2365 +*******************************************************************************
2366 +**
2367 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
2368 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
2369 +**
2370 +**  This copyrighted material is made available to anyone wishing to use,
2371 +**  modify, copy, or redistribute it subject to the terms and conditions
2372 +**  of the GNU General Public License v.2.
2373 +**
2374 +*******************************************************************************
2375 +******************************************************************************/
2376 +
2377 +#ifndef __DLM_INTERNAL_DOT_H__
2378 +#define __DLM_INTERNAL_DOT_H__
2379 +
2380 +/*
2381 + * This is the main header file to be included in each DLM source file.
2382 + */
2383 +
2384 +#define DLM_RELEASE_NAME "<CVS>"
2385 +
2386 +#include <linux/slab.h>
2387 +#include <linux/sched.h>
2388 +#include <asm/semaphore.h>
2389 +#include <linux/types.h>
2390 +#include <linux/spinlock.h>
2391 +#include <linux/vmalloc.h>
2392 +#include <asm/uaccess.h>
2393 +#include <linux/list.h>
2394 +#include <linux/errno.h>
2395 +#include <linux/random.h>
2396 +
2397 +#include <cluster/dlm.h>
2398 +#include <cluster/dlm_device.h>
2399 +#include <cluster/service.h>
2400 +
2401 +#ifndef TRUE
2402 +#define TRUE (1)
2403 +#endif
2404 +
2405 +#ifndef FALSE
2406 +#define FALSE (0)
2407 +#endif
2408 +
2409 +#if (BITS_PER_LONG == 64)
2410 +#define PRIu64 "lu"
2411 +#define PRId64 "ld"
2412 +#define PRIo64 "lo"
2413 +#define PRIx64 "lx"
2414 +#define PRIX64 "lX"
2415 +#define SCNu64 "lu"
2416 +#define SCNd64 "ld"
2417 +#define SCNo64 "lo"
2418 +#define SCNx64 "lx"
2419 +#define SCNX64 "lX"
2420 +#else
2421 +#define PRIu64 "Lu"
2422 +#define PRId64 "Ld"
2423 +#define PRIo64 "Lo"
2424 +#define PRIx64 "Lx"
2425 +#define PRIX64 "LX"
2426 +#define SCNu64 "Lu"
2427 +#define SCNd64 "Ld"
2428 +#define SCNo64 "Lo"
2429 +#define SCNx64 "Lx"
2430 +#define SCNX64 "LX"
2431 +#endif
2432 +
2433 +#define wchan_cond_sleep_intr(chan, sleep_cond) \
2434 +do \
2435 +{ \
2436 +  DECLARE_WAITQUEUE(__wait_chan, current); \
2437 +  current->state = TASK_INTERRUPTIBLE; \
2438 +  add_wait_queue(&chan, &__wait_chan); \
2439 +  if ((sleep_cond)) \
2440 +    schedule(); \
2441 +  remove_wait_queue(&chan, &__wait_chan); \
2442 +  current->state = TASK_RUNNING; \
2443 +} \
2444 +while (0)
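+
+/* The task is queued and marked TASK_INTERRUPTIBLE before sleep_cond is
+ * evaluated, so a wakeup racing with the test just makes schedule()
+ * return immediately instead of being lost */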
2445 +
2446 +static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2447 +{
2448 +    return time_after(jiffies, stamp + seconds * HZ);
2449 +}
2450 +
2451 +
2452 +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2453 +
2454 +#define log_all(ls, fmt, args...) \
2455 +       do { \
2456 +               printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2457 +               dlm_debug_log(ls, fmt, ##args); \
2458 +       } while (0)
2459 +
2460 +#define log_error log_all
2461 +
2462 +
2463 +#define DLM_DEBUG
2464 +#if defined(DLM_DEBUG)
2465 +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2466 +#else
2467 +#define log_debug(ls, fmt, args...)
2468 +#endif
2469 +
2470 +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2471 +#undef log_debug
2472 +#define log_debug log_all
2473 +#endif
2474 +
2475 +
2476 +#define GDLM_ASSERT(x, do) \
2477 +{ \
2478 +  if (!(x)) \
2479 +  { \
2480 +    dlm_debug_dump(); \
2481 +    printk("\nDLM:  Assertion failed on line %d of file %s\n" \
2482 +               "DLM:  assertion:  \"%s\"\n" \
2483 +               "DLM:  time = %lu\n", \
2484 +               __LINE__, __FILE__, #x, jiffies); \
2485 +    {do} \
2486 +    printk("\n"); \
2487 +    BUG(); \
2488 +    panic("DLM:  Record message above and reboot.\n"); \
2489 +  } \
2490 +}
2491 +
2492 +
2493 +struct gd_ls;
2494 +struct gd_lkb;
2495 +struct gd_res;
2496 +struct gd_csb;
2497 +struct gd_node;
2498 +struct gd_resmov;
2499 +struct gd_resdata;
2500 +struct gd_recover;
2501 +struct gd_recinfo;
2502 +struct gd_resdir_bucket;
2503 +struct gd_remlockreply;
2504 +struct gd_remlockrequest;
2505 +struct gd_rcom;
2506 +
2507 +typedef struct gd_ls gd_ls_t;
2508 +typedef struct gd_lkb gd_lkb_t;
2509 +typedef struct gd_res gd_res_t;
2510 +typedef struct gd_csb gd_csb_t;
2511 +typedef struct gd_node gd_node_t;
2512 +typedef struct gd_resmov gd_resmov_t;
2513 +typedef struct gd_resdata gd_resdata_t;
2514 +typedef struct gd_recover gd_recover_t;
2515 +typedef struct gd_resdir_bucket gd_resdir_bucket_t;
2516 +typedef struct gd_rcom gd_rcom_t;
2517 +
2518 +/*
2519 + * Resource Data - an entry for a resource in the resdir hash table
2520 + */
2521 +
2522 +struct gd_resdata {
2523 +       struct list_head rd_list;
2524 +       uint32_t rd_master_nodeid;
2525 +       uint16_t rd_length;
2526 +       uint8_t rd_sequence;
2527 +       char rd_name[1];        /* <rd_length> bytes */
2528 +};
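+
+/* rd_name[1] is the old-style flexible array idiom: the struct is
+ * allocated with enough trailing space for the name, which is why
+ * allocate_resdata() takes the name length */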
2529 +
2530 +/*
2531 + * Resource Directory Bucket - a hash bucket of resdata entries in the resdir
2532 + * hash table
2533 + */
2534 +
2535 +struct gd_resdir_bucket {
2536 +       struct list_head rb_reslist;
2537 +       rwlock_t rb_lock;
2538 +};
2539 +
2540 +/*
2541 + * A resource description as moved between nodes
2542 + */
2543 +
2544 +struct gd_resmov {
2545 +       uint32_t rm_nodeid;
2546 +       uint16_t rm_length;
2547 +       uint16_t rm_pad;
2548 +};
2549 +
2550 +/*
2551 + * An entry in the lock ID table.  Locks for this bucket are kept on list.
2552 + * Counter is used to assign an id to locks as they are added to this bucket.
2553 + */
2554 +
2555 +struct gd_lockidtbl_entry {
2556 +       struct list_head list;
2557 +       uint16_t counter;
2558 +};
2559 +
2560 +/* Elements in the range array */
2561 +
2562 +#define GR_RANGE_START 0
2563 +#define GR_RANGE_END   1
2564 +#define RQ_RANGE_START 2
2565 +#define RQ_RANGE_END   3
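+
+/* lkb_range[] in struct gd_lkb packs the granted and requested ranges
+ * into a single four-element array indexed by these constants */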
2566 +
2567 +/*
2568 + * Lockspace structure.  The context for GDLM locks.
2569 + */
2570 +
2571 +#define RESHASHTBL_SIZE     (256)
2572 +
2573 +#define RESDIRHASH_SHIFT    (9)
2574 +#define RESDIRHASH_SIZE     (1 << RESDIRHASH_SHIFT)
2575 +#define RESDIRHASH_MASK     (RESDIRHASH_SIZE - 1)
2576 +
2577 +#define LSFL_WORK               (0)
2578 +#define LSFL_LS_RUN             (1)
2579 +#define LSFL_LS_STOP            (2)
2580 +#define LSFL_LS_START           (3)
2581 +#define LSFL_LS_FINISH          (4)
2582 +#define LSFL_RECCOMM_WAIT       (5)
2583 +#define LSFL_RECCOMM_READY      (6)
2584 +#define LSFL_NOTIMERS           (7)
2585 +#define LSFL_FINISH_RECOVERY    (8)
2586 +#define LSFL_RESDIR_VALID       (9)
2587 +#define LSFL_ALL_RESDIR_VALID   (10)
2588 +#define LSFL_NODES_VALID        (11)
2589 +#define LSFL_ALL_NODES_VALID    (12)
2590 +#define LSFL_REQUEST_WARN       (13)
2591 +
2592 +#define LSST_NONE           (0)
2593 +#define LSST_INIT           (1)
2594 +#define LSST_INIT_DONE      (2)
2595 +#define LSST_CLEAR          (3)
2596 +#define LSST_WAIT_START     (4)
2597 +#define LSST_RECONFIG_DONE  (5)
2598 +
2599 +struct gd_ls {
2600 +       struct list_head ls_list;       /* list of lockspaces */
2601 +       uint32_t ls_local_id;   /* local unique lockspace ID */
2602 +       uint32_t ls_global_id;  /* global unique lockspace ID */
2603 +       int ls_allocation;      /* Memory allocation policy */
2604 +       unsigned long ls_flags; /* LSFL_ */
2605 +
2606 +       struct list_head ls_rootres;    /* List of root resources */
2607 +
2608 +       int ls_hashsize;
2609 +       int ls_hashmask;
2610 +       struct list_head *ls_reshashtbl;        /* Hash table for resources */
2611 +       rwlock_t ls_reshash_lock;       /* Lock for hash table */
2612 +
2613 +       struct gd_lockidtbl_entry *ls_lockidtbl;
2614 +       uint32_t ls_lockidtbl_size;     /* Size of lock id table */
2615 +       rwlock_t ls_lockidtbl_lock;
2616 +
2617 +       struct list_head ls_nodes;      /* current nodes in RC */
2618 +       uint32_t ls_num_nodes;  /* number of nodes in RC */
2619 +       uint32_t ls_nodes_mask;
2620 +       uint32_t ls_low_nodeid;
2621 +
2622 +       int ls_state;           /* state changes for recovery */
2623 +       struct list_head ls_recover;    /* gr_recover_t structs */
2624 +       int ls_last_stop;       /* event ids from sm */
2625 +       int ls_last_start;
2626 +       int ls_last_finish;
2627 +       spinlock_t ls_recover_lock;
2628 +       struct list_head ls_nodes_gone; /* dead node list for recovery */
2629 +
2630 +       wait_queue_head_t ls_wait_general;
2631 +
2632 +       gd_rcom_t *ls_rcom;
2633 +       uint32_t ls_rcom_msgid;
2634 +       struct semaphore ls_rcom_lock;
2635 +
2636 +       struct list_head ls_recover_list;
2637 +       int ls_recover_list_count;
2638 +       spinlock_t ls_recover_list_lock;
2639 +
2640 +       struct rw_semaphore ls_in_recovery;     /* held in write during
2641 +                                                * recovery, read for normal
2642 +                                                * locking ops */
2643 +       struct rw_semaphore ls_unlock_sem;      /* To prevent unlock on a
2644 +                                                * parent lock racing with a
2645 +                                                * new child lock */
2646 +
2647 +       struct rw_semaphore ls_rec_rsblist;     /* To prevent incoming recovery
2648 +                                                * operations happening while
2649 +                                                * we are purging */
2650 +
2651 +       struct rw_semaphore ls_gap_rsblist;     /* To protect rootres list
2652 +                                                * in grant_after_purge() which
2653 +                                                * runs outside recovery */
2654 +
2655 +       struct list_head ls_rebuild_rootrsb_list;       /* Root of lock trees
2656 +                                                        * we are deserialising
2657 +                                                        */
2658 +
2659 +       struct list_head ls_deadlockq;  /* List of locks in conversion ordered
2660 +                                        * by duetime. for deadlock detection */
2661 +
2662 +       struct list_head ls_requestqueue;       /* List of incoming requests
2663 +                                                * held while we are in
2664 +                                                * recovery */
2665 +
2666 +       gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE];
2667 +
2668 +       int ls_namelen;
2669 +       char ls_name[1];        /* <namelen> bytes */
2670 +};
2671 +
2672 +/*
2673 + * Cluster node (per node in cluster)
2674 + */
2675 +
2676 +struct gd_node {
2677 +       struct list_head gn_list;       /* global list of cluster nodes */
2678 +       uint32_t gn_nodeid;     /* cluster unique nodeid (cman) */
2679 +       uint32_t gn_ipaddr;     /* node's first IP address (cman) */
2680 +       int gn_refcount;        /* number of csb's referencing */
2681 +};
2682 +
2683 +/*
2684 + * Cluster System Block (per node in a ls)
2685 + */
2686 +
2687 +struct gd_csb {
2688 +       struct list_head csb_list;      /* per-lockspace list of nodes */
2689 +       gd_node_t *csb_node;    /* global node structure */
2690 +       int csb_gone_event;     /* event id when node was removed */
2691 +
2692 +       uint32_t csb_names_send_count;
2693 +       uint32_t csb_names_send_msgid;
2694 +       uint32_t csb_names_recv_count;
2695 +       uint32_t csb_names_recv_msgid;
2696 +       uint32_t csb_locks_send_count;
2697 +       uint32_t csb_locks_send_msgid;
2698 +       uint32_t csb_locks_recv_count;
2699 +       uint32_t csb_locks_recv_msgid;
2700 +};
2701 +
2702 +/*
2703 + * Resource block
2704 + */
2705 +
2706 +/* status */
2707 +
2708 +#define GDLM_RESSTS_DIRENTRY     1     /* This is a directory entry */
2709 +#define GDLM_RESSTS_LVBINVALID   2     /* The LVB is invalid */
2710 +
2711 +#define RESFL_NEW_MASTER         (0)
2712 +#define RESFL_RECOVER_LIST       (1)
2713 +
2714 +struct gd_res {
2715 +       struct list_head res_hashchain; /* Chain of resources in this hash
2716 +                                        * bucket */
2717 +
2718 +       gd_ls_t *res_ls;        /* The owning lockspace */
2719 +
2720 +       struct list_head res_rootlist;  /* List of root resources in lockspace */
2721 +
2722 +       struct list_head res_subreslist;        /* List of all sub-resources
2723 +                                                * for this root res. */
2724 +       /* This is a list head on the root res and holds the whole tree below
2725 +        * it. */
2726 +       uint8_t res_depth;      /* Depth in resource tree */
2727 +       uint16_t res_status;
2728 +       unsigned long res_flags;        /* Flags, RESFL_ */
2729 +
2730 +       struct list_head res_grantqueue;
2731 +       struct list_head res_convertqueue;
2732 +       struct list_head res_waitqueue;
2733 +
2734 +       uint32_t res_nodeid;    /* nodeid of master node */
2735 +
2736 +       gd_res_t *res_root;     /* If a subresource, this is our root */
2737 +       gd_res_t *res_parent;   /* Our parent resource (if any) */
2738 +
2739 +       atomic_t res_ref;       /* No of lkb's */
2740 +       uint16_t res_remasterid;        /* ID used during remaster */
2741 +       struct list_head res_recover_list;      /* General list for use during
2742 +                                                * recovery */
2743 +       int res_recover_msgid;
2744 +       int res_newlkid_expect;
2745 +
2746 +       struct rw_semaphore res_lock;
2747 +
2748 +       char *res_lvbptr;       /* Lock value block */
2749 +
2750 +       uint8_t res_resdir_seq; /* Last directory sequence number */
2751 +
2752 +       uint8_t res_length;
2753 +       char res_name[1];       /* <res_length> bytes */
2754 +};
2755 +
2756 +/*
2757 + * Lock block. To avoid confusion, where flags mirror the
2758 + * public flags, they should have the same value.
2759 + */
2760 +
2761 +#define GDLM_LKSTS_NEW          (0)
2762 +#define GDLM_LKSTS_WAITING      (1)
2763 +#define GDLM_LKSTS_GRANTED      (2)
2764 +#define GDLM_LKSTS_CONVERT      (3)
2765 +
2766 +#define GDLM_LKFLG_VALBLK       (0x00000008)
2767 +#define GDLM_LKFLG_PERSISTENT   (0x00000080)   /* Don't unlock when process exits */
2768 +#define GDLM_LKFLG_NODLCKWT     (0x00000100)       /* Don't do deadlock detection */
2769 +#define GDLM_LKFLG_EXPEDITE     (0x00000400)       /* Move to head of convert queue */
2770 +
2771 +/* Internal flags */
2772 +#define GDLM_LKFLG_RANGE        (0x00001000)   /* Range field is present (remote protocol only) */
2773 +#define GDLM_LKFLG_MSTCPY       (0x00002000)
2774 +#define GDLM_LKFLG_DELETED      (0x00004000)   /* LKB is being deleted */
2775 +#define GDLM_LKFLG_DELAST       (0x00008000)   /* Delete after delivering AST */
2776 +#define GDLM_LKFLG_LQRESEND     (0x00010000)   /* LKB on lockqueue must be resent */
2777 +#define GDLM_LKFLG_DEMOTED      (0x00020000)
2778 +#define GDLM_LKFLG_RESENT       (0x00040000)
2779 +#define GDLM_LKFLG_NOREBUILD    (0x00080000)
2780 +#define GDLM_LKFLG_LQCONVERT    (0x00100000)
2781 +
2782 +struct gd_lkb {
2783 +       void *lkb_astaddr;
2784 +       void *lkb_bastaddr;
2785 +       long lkb_astparam;
2786 +
2787 +       uint32_t lkb_flags;
2788 +       uint16_t lkb_status;    /* LKSTS_ granted, waiting, converting */
2789 +       int8_t lkb_rqmode;      /* Requested lock mode */
2790 +       int8_t lkb_grmode;      /* Granted lock mode */
2791 +       uint8_t lkb_bastmode;   /* Requested mode returned in bast */
2792 +       uint8_t lkb_highbast;   /* Highest mode we have sent a BAST for */
2793 +       uint32_t lkb_retstatus; /* Status to return in lksb */
2794 +
2795 +       uint32_t lkb_id;        /* Our lock ID */
2796 +       struct dlm_lksb *lkb_lksb;      /* Lock status block of caller */
2797 +       struct list_head lkb_idtbl_list;        /* list pointer into the
2798 +                                                * lockidtbl */
2799 +
2800 +       struct list_head lkb_statequeue;        /* List of locks in this state */
2801 +
2802 +       struct list_head lkb_ownerqueue;        /* List of locks owned by a
2803 +                                                * process */
2804 +
2805 +       gd_lkb_t *lkb_parent;   /* Pointer to parent if any */
2806 +
2807 +       atomic_t lkb_childcnt;  /* Number of children */
2808 +
2809 +       struct list_head lkb_lockqueue; /* For when we are on the lock queue */
2810 +       int lkb_lockqueue_state;
2811 +       int lkb_lockqueue_flags;        /* As passed into lock/unlock */
2812 +       unsigned long lkb_lockqueue_time;       /* Time we went on the lock
2813 +                                                * queue */
2814 +
2815 +       gd_res_t *lkb_resource;
2816 +
2817 +       unsigned long lkb_duetime;      /* For deadlock detection */
2818 +
2819 +       uint32_t lkb_remid;     /* Remote partner */
2820 +       uint32_t lkb_nodeid;
2821 +
2822 +       struct list_head lkb_astqueue;  /* For when we are on the AST queue */
2823 +       uint32_t lkb_asts_to_deliver;
2824 +
2825 +       struct gd_remlockrequest *lkb_request;
2826 +
2827 +       struct list_head lkb_deadlockq; /* on ls_deadlockq list */
2828 +
2829 +       char *lkb_lvbptr;       /* Points to lksb on a local lock, allocated
2830 +                                * LVB (if necessary) on a remote lock */
2831 +       uint64_t *lkb_range;    /* Points to an array of 64 bit numbers that
2832 +                                * represent the requested and granted ranges
2833 +                                * of the lock. NULL implies 0-ffffffffffffffff
2834 +                                */
2835 +};
2836 +
2837 +/*
2838 + * Used to save and manage recovery state for a lockspace.
2839 + */
2840 +
2841 +struct gd_recover {
2842 +       struct list_head gr_list;
2843 +       uint32_t *gr_nodeids;
2844 +       int gr_node_count;
2845 +       int gr_event_id;
2846 +};
2847 +
2848 +/*
2849 + * Header part of the mid-level comms system. All packets start with
2850 + * this header so we can identify them. The comms packet can
2851 + * contain many of these structs but they are split into individual
2852 + * work units before being passed to the lockqueue routines.
2853 + * Below this are the structs for which this is the header.
2854 + */
2855 +
2856 +struct gd_req_header {
2857 +       uint8_t rh_cmd;         /* What we are */
2858 +       uint8_t rh_flags;       /* maybe just a pad */
2859 +       uint16_t rh_length;     /* Length of struct (so we can send several in
2860 +                                * one message) */
2861 +       uint32_t rh_lkid;       /* Lock ID tag: ie the local (requesting) lock
2862 +                                * ID */
2863 +       uint32_t rh_lockspace;  /* Lockspace ID */
2864 +};
2865 +
2866 +/*
2867 + * This is the struct used in a remote lock/unlock/convert request
2868 + * The mid-level comms API should turn this into native byte order.
2869 + * Most "normal" lock operations will use these two structs for
2870 + * communications. Recovery operations use their own structs
2871 + * but still with the gd_req_header on the front.
2872 + */
2873 +
2874 +struct gd_remlockrequest {
2875 +       struct gd_req_header rr_header;
2876 +
2877 +       uint32_t rr_remlkid;    /* Remote lock ID */
2878 +       uint32_t rr_remparid;   /* Parent's remote lock ID or 0 */
2879 +       uint32_t rr_flags;      /* Flags from lock/convert request */
2880 +       uint64_t rr_range_start; /* Yes, these are in the right place... */
2881 +       uint64_t rr_range_end;
2882 +       uint32_t rr_status;     /* Status to return if this is an AST request */
2883 +       uint8_t rr_rqmode;      /* Requested lock mode */
2884 +       uint8_t rr_asts;        /* Whether the LKB has ASTs or not */
2885 +       uint8_t rr_resdir_seq;  /* Directory sequence number */
2886 +       char rr_lvb[DLM_LVB_LEN];       /* Value block */
2887 +       char rr_name[1];        /* As long as needs be. Only used for directory
2888 +                                * lookups. The length of this can be worked
2889 +                                * out from the packet length */
2890 +};
2891 +
2892 +/*
2893 + * This is the struct returned by a remote lock/unlock/convert request
2894 + * The mid-level comms API should turn this into native byte order.
2895 + */
2896 +
2897 +struct gd_remlockreply {
2898 +       struct gd_req_header rl_header;
2899 +
2900 +       uint32_t rl_lockstate;  /* Whether request was queued/granted/waiting */
2901 +       uint32_t rl_nodeid;     /* nodeid of lock master */
2902 +       uint32_t rl_status;     /* Status to return to caller */
2903 +       uint32_t rl_lkid;       /* Remote lkid */
2904 +       uint8_t rl_resdir_seq;  /* Returned directory sequence number */
2905 +       char rl_lvb[DLM_LVB_LEN];       /* LVB itself */
2906 +};
2907 +
2908 +/*
2909 + * Recovery comms message
2910 + */
2911 +
2912 +struct gd_rcom {
2913 +       struct gd_req_header rc_header; /* 32 byte aligned */
2914 +       uint32_t rc_msgid;
2915 +       uint16_t rc_datalen;
2916 +       uint8_t rc_expanded;
2917 +       uint8_t rc_subcmd;      /* secondary command */
2918 +       char rc_buf[1];         /* first byte of data goes here and extends
2919 +                                * beyond here for another datalen - 1 bytes.
2920 +                                * rh_length is set to sizeof(gd_rcom_t) +
2921 +                                * datalen - 1 */
2922 +};
2923 +
2924 +
2925 +/* A remote query: GDLM_REMCMD_QUERY */
2926 +struct gd_remquery {
2927 +       struct gd_req_header rq_header;
2928 +
2929 +       uint32_t rq_mstlkid;   /* LockID on master node */
2930 +        uint32_t rq_query;     /* query from the user */
2931 +        uint32_t rq_maxlocks;  /* max number of locks we can cope with */
2932 +};
2933 +
2934 +/* First block of a query reply.  cmd = GDLM_REMCMD_QUERYREPLY */
2935 +/* There may be subsequent blocks of
2936 +   lock info in GDLM_REMCMD_QUERYCONT messages which just have
2937 +   a normal header. The last of these will have rh_flags set to
2938 +   GDLM_REMFLAG_ENDQUERY
2939 + */
2940 +struct gd_remqueryreply {
2941 +       struct gd_req_header rq_header;
2942 +
2943 +        uint32_t rq_numlocks;  /* Number of locks in reply */
2944 +        uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */
2945 +        uint32_t rq_status;
2946 +
2947 +        /* Resource information */
2948 +       uint32_t rq_grantcount; /* No. of nodes on grant queue */
2949 +       uint32_t rq_convcount;  /* No. of nodes on convert queue */
2950 +       uint32_t rq_waitcount;  /* No. of nodes on wait queue */
2951 +        char rq_valblk[DLM_LVB_LEN];   /* Master's LVB contents, if applicable */
2952 +};
2953 +
2954 +/*
2955 + * Lockqueue wait lock states
2956 + */
2957 +
2958 +#define GDLM_LQSTATE_WAIT_RSB       1
2959 +#define GDLM_LQSTATE_WAIT_CONVERT   2
2960 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3
2961 +#define GDLM_LQSTATE_WAIT_UNLOCK    4
2962 +
2963 +/* Commands sent across the comms link */
2964 +#define GDLM_REMCMD_LOOKUP          1
2965 +#define GDLM_REMCMD_LOCKREQUEST     2
2966 +#define GDLM_REMCMD_UNLOCKREQUEST   3
2967 +#define GDLM_REMCMD_CONVREQUEST     4
2968 +#define GDLM_REMCMD_LOCKREPLY       5
2969 +#define GDLM_REMCMD_LOCKGRANT       6
2970 +#define GDLM_REMCMD_SENDBAST        7
2971 +#define GDLM_REMCMD_SENDCAST        8
2972 +#define GDLM_REMCMD_REM_RESDATA     9
2973 +#define GDLM_REMCMD_RECOVERMESSAGE  20
2974 +#define GDLM_REMCMD_RECOVERREPLY    21
2975 +#define GDLM_REMCMD_QUERY           30
2976 +#define GDLM_REMCMD_QUERYREPLY      31
2977 +
2978 +/* Set in rh_flags when this is the last block of
2979 +   query information. Note this could also be the first
2980 +   block */
2981 +#define GDLM_REMFLAG_ENDQUERY       1
2982 +
2983 +/*
2984 + * This is both a parameter to queue_ast and the bitmap of ASTs in
2985 + * lkb_asts_to_deliver.
2986 + */
2987 +
2988 +typedef enum { GDLM_QUEUE_COMPAST = 1, GDLM_QUEUE_BLKAST = 2 } gd_ast_type_t;
2989 +
2990 +#ifndef BUG_ON
2991 +#define BUG_ON(x)
2992 +#endif
2993 +
2994 +void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...);
2995 +void dlm_debug_dump(void);
2996 +
2997 +#endif                         /* __DLM_INTERNAL_DOT_H__ */
2998 diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
2999 --- linux-orig/cluster/dlm/lkb.c        1970-01-01 07:30:00.000000000 +0730
3000 +++ linux-patched/cluster/dlm/lkb.c     2004-06-25 18:31:07.000000000 +0800
3001 @@ -0,0 +1,225 @@
3002 +/******************************************************************************
3003 +*******************************************************************************
3004 +**
3005 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
3006 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
3007 +**  
3008 +**  This copyrighted material is made available to anyone wishing to use,
3009 +**  modify, copy, or redistribute it subject to the terms and conditions
3010 +**  of the GNU General Public License v.2.
3011 +**
3012 +*******************************************************************************
3013 +******************************************************************************/
3014 +
3015 +/* 
3016 + * lkb.c
3017 + *
3018 + * Allocate and free locks on the lock ID table.
3019 + *
3020 + * This is slightly naff but I don't really like the
3021 + * VMS lockidtbl stuff as it uses a realloced array
3022 + * to hold the locks in. I think this is slightly better
3023 + * in some ways.
3024 + *
3025 + * Any better suggestions gratefully received. Patrick
3026 + *
3027 + */
3028 +
3029 +#include "dlm_internal.h"
3030 +#include "lockqueue.h"
3031 +#include "lkb.h"
3032 +#include "config.h"
3033 +#include "rsb.h"
3034 +#include "memory.h"
3035 +#include "lockspace.h"
3036 +#include "util.h"
3037 +
3038 +/* 
3039 + * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3040 + */
3041 +
3042 +static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3043 +{
3044 +       uint16_t entry = lkid & 0xFFFF;
3045 +       gd_lkb_t *lkb;
3046 +
3047 +       if (entry >= ls->ls_lockidtbl_size)
3048 +               goto out;
3049 +
3050 +       list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){
3051 +               if (lkb->lkb_id == lkid)
3052 +                       return lkb;
3053 +       }
3054 +
3055 +      out:
3056 +       return NULL;
3057 +}
3058 +
3059 +/* 
3060 + * Should be called at lockspace initialisation time.
3061 + */
3062 +
3063 +int init_lockidtbl(gd_ls_t *ls, int entries)
3064 +{
3065 +       int i;
3066 +
3067 +       /* Make sure it's a power of two */
3068 +       GDLM_ASSERT(!(entries & (entries - 1)),);
3069 +
3070 +       ls->ls_lockidtbl_size = entries;
3071 +       rwlock_init(&ls->ls_lockidtbl_lock);
3072 +
3073 +       ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry),
3074 +                                  GFP_KERNEL);
3075 +       if (!ls->ls_lockidtbl)
3076 +               return -ENOMEM;
3077 +
3078 +       for (i = 0; i < entries; i++) {
3079 +               INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list);
3080 +               ls->ls_lockidtbl[i].counter = 1;
3081 +       }
3082 +
3083 +       return 0;
3084 +}
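
The power-of-two assertion matters because create_lkb() below reduces a
random value to a bucket index with a bitwise AND rather than a modulo.
A standalone illustration of the equivalence (userspace, demonstration only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint16_t entries = 1024, raw = 0xBEEF;

	assert((entries & (entries - 1)) == 0);		/* power of two  */
	assert((raw & (entries - 1)) == raw % entries);	/* AND == modulo */
	return 0;
}
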
3085 +
3086 +/* 
3087 + * Free up the space - returns an error if there are still locks hanging around
3088 + */
3089 +
3090 +int free_lockidtbl(gd_ls_t *ls)
3091 +{
3092 +       int i;
3093 +
3094 +       write_lock(&ls->ls_lockidtbl_lock);
3095 +
3096 +       for (i = 0; i < ls->ls_lockidtbl_size; i++) {
3097 +               if (!list_empty(&ls->ls_lockidtbl[i].list)) {
3098 +                       write_unlock(&ls->ls_lockidtbl_lock);
3099 +                       return -1;
3100 +               }
3101 +       }
3102 +       kfree(ls->ls_lockidtbl);
3103 +
3104 +       write_unlock(&ls->ls_lockidtbl_lock);
3105 +
3106 +       return 0;
3107 +}
3108 +
3109 +/* 
3110 + * LKB lkids are 32 bits and have two 16-bit parts.  The bottom 16 bits are a
3111 + * random number between 0 and lockidtbl_size-1.  This random number specifies
3112 + * the "bucket" for the lkb in lockidtbl.  The upper 16 bits are a sequentially
3113 + * assigned per-bucket id.
3114 + *
3115 + * Because the 16-bit ids per bucket can roll over, a new lkid must be checked
3116 + * against the lkids of all lkbs in the bucket to avoid duplication.
3117 + *
3118 + */
3119 +
3120 +gd_lkb_t *create_lkb(gd_ls_t *ls)
3121 +{
3122 +       gd_lkb_t *lkb;
3123 +       uint32_t lkid;
3124 +       uint16_t bucket;
3125 +
3126 +       lkb = allocate_lkb(ls);
3127 +       if (!lkb)
3128 +               goto out;
3129 +
3130 +       write_lock(&ls->ls_lockidtbl_lock);
3131 +       do {
3132 +               get_random_bytes(&bucket, sizeof(bucket));
3133 +               bucket &= (ls->ls_lockidtbl_size - 1);
3134 +               lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16);
3135 +       }
3136 +       while (__find_lock_by_id(ls, lkid));
3137 +
3138 +       lkb->lkb_id = (uint32_t) lkid;
3139 +       list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list);
3140 +       write_unlock(&ls->ls_lockidtbl_lock);
3141 +
3142 +      out:
3143 +       return lkb;
3144 +}
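
For illustration only, an lkid built by create_lkb() decomposes back into
its two halves like this:

#include <stdint.h>

static inline uint16_t lkid_bucket(uint32_t lkid)
{
	return lkid & 0xFFFF;	/* low 16 bits: lockidtbl bucket */
}

static inline uint16_t lkid_seq(uint32_t lkid)
{
	return lkid >> 16;	/* high 16 bits: per-bucket counter */
}
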
3145 +
3146 +/* 
3147 + * Free LKB and remove it from the lockidtbl.
3148 + * NB - this always frees the lkb whereas release_rsb doesn't free an
3149 + * rsb unless its reference count is zero.
3150 + */
3151 +
3152 +void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb)
3153 +{
3154 +       if (lkb->lkb_status) {
3155 +               log_error(ls, "release lkb with status %u", lkb->lkb_status);
3156 +               print_lkb(lkb);
3157 +               return;
3158 +       }
3159 +
3160 +       if (lkb->lkb_parent)
3161 +               atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3162 +
3163 +       write_lock(&ls->ls_lockidtbl_lock);
3164 +       list_del(&lkb->lkb_idtbl_list);
3165 +       write_unlock(&ls->ls_lockidtbl_lock);
3166 +
3167 +       /* if this is not a master copy then lvbptr points into the user's
3168 +        * lksb, so don't free it */
3169 +       if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3170 +               free_lvb(lkb->lkb_lvbptr);
3171 +
3172 +       if (lkb->lkb_range)
3173 +               free_range(lkb->lkb_range);
3174 +
3175 +       free_lkb(lkb);
3176 +}
3177 +
3178 +gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3179 +{
3180 +       gd_lkb_t *lkb;
3181 +
3182 +       read_lock(&ls->ls_lockidtbl_lock);
3183 +       lkb = __find_lock_by_id(ls, lkid);
3184 +       read_unlock(&ls->ls_lockidtbl_lock);
3185 +
3186 +       return lkb;
3187 +}
3188 +
3189 +gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid)
3190 +{
3191 +       gd_ls_t *lspace = find_lockspace_by_local_id(ls);
3192 +       return find_lock_by_id(lspace, lkid);
3193 +}
3194 +
3195 +/*
3196 + * Initialise the range parts of an LKB.
3197 + */
3198 +
3199 +int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end)
3200 +{
3201 +       int ret = -ENOMEM;
3202 +
3203 +       /*
3204 +        * if this wasn't already a range lock, make it one
3205 +        */
3206 +       if (!lkb->lkb_range) {
3207 +               lkb->lkb_range = allocate_range(lspace);
3208 +               if (!lkb->lkb_range)
3209 +                       goto out;
3210 +
3211 +               /*
3212 +                * This is needed for conversions that contain ranges where the
3213 +                * original lock didn't, but it's harmless for new locks too.
3214 +                */
3215 +               lkb->lkb_range[GR_RANGE_START] = 0LL;
3216 +               lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3217 +       }
3218 +
3219 +       lkb->lkb_range[RQ_RANGE_START] = start;
3220 +       lkb->lkb_range[RQ_RANGE_END] = end;
3221 +
3222 +       ret = 0;
3223 +
3224 +      out:
3225 +       return ret;
3226 +}
3227 diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3228 --- linux-orig/cluster/dlm/lkb.h        1970-01-01 07:30:00.000000000 +0730
3229 +++ linux-patched/cluster/dlm/lkb.h     2004-06-25 18:31:07.000000000 +0800
3230 @@ -0,0 +1,27 @@
3231 +/******************************************************************************
3232 +*******************************************************************************
3233 +**
3234 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
3235 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
3236 +**  
3237 +**  This copyrighted material is made available to anyone wishing to use,
3238 +**  modify, copy, or redistribute it subject to the terms and conditions
3239 +**  of the GNU General Public License v.2.
3240 +**
3241 +*******************************************************************************
3242 +******************************************************************************/
3243 +
3244 +#ifndef __LKB_DOT_H__
3245 +#define __LKB_DOT_H__
3246 +
3247 +int free_lockidtbl(gd_ls_t * lspace);
3248 +int init_lockidtbl(gd_ls_t * lspace, int entries);
3249 +
3250 +gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid);
3251 +gd_lkb_t *create_lkb(gd_ls_t *ls);
3252 +void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb);
3253 +gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid);
3254 +int verify_lkb_nodeids(gd_ls_t *ls);
3255 +int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end);
3256 +
3257 +#endif                         /* __LKB_DOT_H__ */
3258 diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3259 --- linux-orig/cluster/dlm/locking.c    1970-01-01 07:30:00.000000000 +0730
3260 +++ linux-patched/cluster/dlm/locking.c 2004-06-25 18:31:07.000000000 +0800
3261 @@ -0,0 +1,1225 @@
3262 +/******************************************************************************
3263 +*******************************************************************************
3264 +**
3265 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
3266 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
3267 +**  
3268 +**  This copyrighted material is made available to anyone wishing to use,
3269 +**  modify, copy, or redistribute it subject to the terms and conditions
3270 +**  of the GNU General Public License v.2.
3271 +**
3272 +*******************************************************************************
3273 +******************************************************************************/
3274 +
3275 +/* 
3276 + * locking.c
3277 + *
3278 + * This is where the main work of the DLM goes on
3279 + *
3280 + */
3281 +
3282 +#include "dlm_internal.h"
3283 +#include "lockqueue.h"
3284 +#include "locking.h"
3285 +#include "lockspace.h"
3286 +#include "lkb.h"
3287 +#include "nodes.h"
3288 +#include "dir.h"
3289 +#include "ast.h"
3290 +#include "memory.h"
3291 +#include "rsb.h"
3292 +
3293 +#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3294 +
3295 +/* 
3296 + * Lock compatibility matrix - thanks Steve
3297 + * UN = Unlocked state. Not really a state, used as a flag
3298 + * PD = Padding. Used to make the matrix a nice power of two in size
3299 + * Other states are the same as the VMS DLM.
3300 + * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
3301 + */
3302 +
3303 +#define modes_compat(gr, rq) \
3304 +       __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3305 +
3306 +const int __dlm_compat_matrix[8][8] = {
3307 +      /* UN NL CR CW PR PW EX PD */
3308 +       {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
3309 +       {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
3310 +       {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
3311 +       {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
3312 +       {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
3313 +       {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
3314 +       {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
3315 +       {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
3316 +};
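
A worked lookup, assuming the usual DLM mode numbering from dlm.h
(NL=0, CR=1, CW=2, PR=3, PW=4, EX=5 -- an assumption here, since dlm.h is
not part of this hunk):

static void compat_matrix_examples(void)
{
	/* granted PR vs requested PW: row PR, column PW -> 0, a conflict */
	GDLM_ASSERT(__dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_PW + 1] == 0,);

	/* granted PR vs requested CR: row PR, column CR -> 1, compatible */
	GDLM_ASSERT(__dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_CR + 1] == 1,);
}
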
3317 +
3318 +/* 
3319 + * Compatibility matrix for conversions with QUECVT set.
3320 + * Granted mode is the row; requested mode is the column.
3321 + * Usage: matrix[grmode+1][rqmode+1]
3322 + */
3323 +
3324 +const int __quecvt_compat_matrix[8][8] = {
3325 +      /* UN NL CR CW PR PW EX PD */
3326 +       {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
3327 +       {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
3328 +       {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
3329 +       {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
3330 +       {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
3331 +       {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
3332 +       {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
3333 +       {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
3334 +};
3335 +
3336 +/* 
3337 + * This defines the direction of transfer of LVB data.
3338 + * Granted mode is the row; requested mode is the column.
3339 + * Usage: matrix[grmode+1][rqmode+1]
3340 + * 1 = LVB is returned to the caller
3341 + * 0 = LVB is written to the resource
3342 + * -1 = nothing happens to the LVB
3343 + */
3344 +
3345 +const int __lvb_operations[8][8] = {
3346 +       /* UN   NL  CR  CW  PR  PW  EX  PD*/
3347 +       {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
3348 +       {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
3349 +       {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
3350 +       {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
3351 +       {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
3352 +       {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
3353 +       {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
3354 +       {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
3355 +};
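
Read with the same assumed mode numbers: an up-convert from NL to EX hits
[NL+1][EX+1] == 1 (the resource's LVB is copied out to the caller), while a
down-convert from EX to NL hits [EX+1][NL+1] == 0 (the caller's LVB is
written to the resource).  A sketch of the intent, with grmode/rqmode and
the two buffers as placeholders; note grant_lock() below collapses this to
a nonzero/zero test, folding the -1 entries into the copy-out branch:

static void apply_lvb_op(int grmode, int rqmode,
			 char *caller_lvb, char *resource_lvb)
{
	int b = __lvb_operations[grmode + 1][rqmode + 1];

	if (b == 1)
		memcpy(caller_lvb, resource_lvb, DLM_LVB_LEN);	/* LVB out */
	else if (b == 0)
		memcpy(resource_lvb, caller_lvb, DLM_LVB_LEN);	/* LVB in  */
	/* b == -1: the LVB is left untouched */
}
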
3356 +
3357 +static void grant_lock(gd_lkb_t * lkb, int send_remote);
3358 +static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb);
3359 +static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb);
3360 +static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb,
3361 +                       int flags, void *ast, void *astarg, void *bast,
3362 +                       struct dlm_range *range);
3363 +static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags,
3364 +                          char *name, int namelen);
3365 +
3366 +
3367 +static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head)
3368 +{
3369 +       gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue);
3370 +
3371 +       if (lkb->lkb_id == first->lkb_id)
3372 +               return 1;
3373 +
3374 +       return 0;
3375 +}
3376 +
3377 +/* 
3378 + * Return 1 if the locks' ranges overlap
3379 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3380 + */
3381 +
3382 +static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
3383 +{
3384 +       if (!lkb1->lkb_range || !lkb2->lkb_range)
3385 +               return 1;
3386 +
3387 +       if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3388 +           lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3389 +               return 0;
3390 +
3391 +       return 1;
3392 +}
3393 +
3394 +/*
3395 + * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
3396 + * locks on the convert queue.  One of the deadlocked locks is allowed to
3397 + * retain its original granted state (we choose the lkb provided although it
3398 + * shouldn't matter which.)  We do not change the granted mode on locks without
3399 + * the CONVDEADLK flag.  If any of these exist (there shouldn't be if the app
3400 + * uses the flag consistently), the false return value is used.
3401 + */
3402 +
3403 +static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
3404 +{
3405 +       gd_lkb_t *this;
3406 +       int rv = TRUE;
3407 +
3408 +       list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3409 +               if (this == lkb)
3410 +                       continue;
3411 +
3412 +               if (!ranges_overlap(lkb, this))
3413 +                       continue;
3414 +
3415 +               if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
3416 +
3417 +                       if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
3418 +                               rv = FALSE;
3419 +                               continue;
3420 +                       }
3421 +                       this->lkb_grmode = DLM_LOCK_NL;
3422 +                       this->lkb_flags |= GDLM_LKFLG_DEMOTED;
3423 +               }
3424 +       }
3425 +       return rv;
3426 +}
3427 +
3428 +/*
3429 + * "A conversion deadlock arises with a pair of lock requests in the converting
3430 + * queue for one resource.  The granted mode of each lock blocks the requested
3431 + * mode of the other lock."
3432 + */
3433 +
3434 +static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3435 +{
3436 +       gd_lkb_t *this;
3437 +
3438 +       list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3439 +               if (this == lkb)
3440 +                       continue;
3441 +
3442 +               if (!ranges_overlap(lkb, this))
3443 +                       continue;
3444 +
3445 +               if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3446 +                       return TRUE;
3447 +       }
3448 +       return FALSE;
3449 +}
3450 +
3451 +/*
3452 + * Check if the given lkb conflicts with another lkb on the queue.
3453 + */
3454 +
3455 +static int queue_conflict(struct list_head *head, gd_lkb_t *lkb)
3456 +{
3457 +       gd_lkb_t *this;
3458 +
3459 +       list_for_each_entry(this, head, lkb_statequeue) {
3460 +               if (this == lkb)
3461 +                       continue;
3462 +               if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3463 +                       return TRUE;
3464 +       }
3465 +       return FALSE;
3466 +}
3467 +
3468 +/*
3469 + * Deadlock can arise when using the QUECVT flag if the requested mode of the
3470 + * first converting lock is incompatible with the granted mode of another
3471 + * converting lock further down the queue.  To prevent this deadlock, a
3472 + * requested QUECVT lock is granted immediately if adding it to the end of
3473 + * the queue would prevent a lock ahead of it from being granted.
3474 + */
3475 +
3476 +static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3477 +{
3478 +       gd_lkb_t *this;
3479 +
3480 +       list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3481 +               if (this == lkb)
3482 +                       break;
3483 +
3484 +               if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
3485 +                       return TRUE;
3486 +       }
3487 +       return FALSE;
3488 +}
3489 +
3490 +/* 
3491 + * Return 1 if the lock can be granted, 0 otherwise.
3492 + * Also detect and resolve conversion deadlocks.
3493 + */
3494 +
3495 +static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
3496 +{
3497 +       if (lkb->lkb_rqmode == DLM_LOCK_NL)
3498 +               return TRUE;
3499 +
3500 +       if (lkb->lkb_rqmode == lkb->lkb_grmode)
3501 +               return TRUE;
3502 +
3503 +       if (queue_conflict(&rsb->res_grantqueue, lkb))
3504 +               return FALSE;
3505 +
3506 +       if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
3507 +               if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3508 +                       return TRUE;
3509 +
3510 +               if (list_empty(&rsb->res_convertqueue) ||
3511 +                   first_in_list(lkb, &rsb->res_convertqueue) ||
3512 +                   queuecvt_deadlock_detect(rsb, lkb))
3513 +                       return TRUE;
3514 +               else
3515 +                       return FALSE;
3516 +       }
3517 +
3518 +       /* there *is* a conflict between this lkb and a converting lock so
3519 +          we return false unless conversion deadlock resolution is permitted
3520 +          (only conversion requests will have the CONVDEADLK flag set) */
3521 +
3522 +       if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
3523 +               return FALSE;
3524 +
3525 +       if (!conversion_deadlock_detect(rsb, lkb))
3526 +               return FALSE;
3527 +
3528 +       if (conversion_deadlock_resolve(rsb, lkb))
3529 +               return TRUE;
3530 +
3531 +       return FALSE;
3532 +}
3533 +
3534 +int dlm_lock(void *lockspace,
3535 +            uint32_t mode,
3536 +            struct dlm_lksb *lksb,
3537 +            uint32_t flags,
3538 +            void *name,
3539 +            unsigned int namelen,
3540 +            uint32_t parent,
3541 +            void (*ast) (void *astarg),
3542 +            void *astarg,
3543 +            void (*bast) (void *astarg, int mode),
3544 +            struct dlm_range *range)
3545 +{
3546 +       gd_ls_t *lspace;
3547 +       gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3548 +       int ret = -EINVAL;
3549 +
3550 +       lspace = find_lockspace_by_local_id(lockspace);
3551 +       if (!lspace)
3552 +               goto out;
3553 +
3554 +       if (mode < 0 || mode > DLM_LOCK_EX)
3555 +               goto out;
3556 +
3557 +       if (namelen > DLM_RESNAME_MAXLEN)
3558 +               goto out;
3559 +
3560 +       if (flags & DLM_LKF_CANCEL)
3561 +               goto out;
3562 +
3563 +       if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3564 +               goto out;
3565 +
3566 +       if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
3567 +               goto out;
3568 +
3569 +       if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3570 +               goto out;
3571 +
3572 +       if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3573 +               goto out;
3574 +
3575 +       if (!ast || !lksb)
3576 +               goto out;
3577 +
3578 +       if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK))
3579 +               goto out;
3580 +
3581 +       if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3582 +               goto out;
3583 +
3584 +       /* 
3585 +        * Take conversion path.
3586 +        */
3587 +
3588 +       if (flags & DLM_LKF_CONVERT) {
3589 +               ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3590 +                                  bast, range);
3591 +               goto out;
3592 +       }
3593 +
3594 +       /* 
3595 +        * Take new lock path.
3596 +        */
3597 +
3598 +       if (parent) {
3599 +               down_read(&lspace->ls_unlock_sem);
3600 +
3601 +               parent_lkb = find_lock_by_id(lspace, parent);
3602 +
3603 +               if (!parent_lkb ||
3604 +                   parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3605 +                   parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3606 +                   parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3607 +                       up_read(&lspace->ls_unlock_sem);
3608 +                       goto out;
3609 +               }
3610 +
3611 +               atomic_inc(&parent_lkb->lkb_childcnt);
3612 +               up_read(&lspace->ls_unlock_sem);
3613 +       }
3614 +
3615 +       down_read(&lspace->ls_in_recovery);
3616 +
3617 +       ret = -ENOMEM;
3618 +
3619 +       lkb = create_lkb(lspace);
3620 +       if (!lkb)
3621 +               goto fail_dec;
3622 +       lkb->lkb_astaddr = ast;
3623 +       lkb->lkb_astparam = (long) astarg;
3624 +       lkb->lkb_bastaddr = bast;
3625 +       lkb->lkb_rqmode = mode;
3626 +       lkb->lkb_grmode = DLM_LOCK_IV;
3627 +       lkb->lkb_lksb = lksb;
3628 +       lkb->lkb_parent = parent_lkb;
3629 +       lkb->lkb_lockqueue_flags = flags;
3630 +       lkb->lkb_lvbptr = lksb->sb_lvbptr;
3631 +
3632 +       /* Copy the range if appropriate */
3633 +       if (range) {
3634 +               if (range->ra_start > range->ra_end) {
3635 +                       ret = -EINVAL;
3636 +                       goto fail_free;
3637 +               }
3638 +
3639 +               if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3640 +                       goto fail_free;
3641 +       }
3642 +
3643 +       /* Convert relevant flags to internal numbers */
3644 +       if (flags & DLM_LKF_VALBLK)
3645 +               lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3646 +       if (flags & DLM_LKF_PERSISTENT)
3647 +               lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3648 +       if (flags & DLM_LKF_NODLCKWT)
3649 +               lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3650 +
3651 +       lksb->sb_lkid = lkb->lkb_id;
3652 +
3653 +       ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3654 +       if (ret)
3655 +               goto fail_free;
3656 +
3657 +       up_read(&lspace->ls_in_recovery);
3658 +
3659 +       wake_astd();
3660 +
3661 +       return 0;
3662 +
3663 +      fail_free:
3664 +       release_lkb(lspace, lkb);
3665 +       goto fail_unlock;
3666 +
3667 +      fail_dec:
3668 +       if (parent_lkb)
3669 +               atomic_dec(&parent_lkb->lkb_childcnt);
3670 +
3671 +      fail_unlock:
3672 +       up_read(&lspace->ls_in_recovery);
3673 +
3674 +      out:
3675 +       return ret;
3676 +}
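
A hedged usage sketch of this API (not from the patch): take a new EX lock
and wait for the completion AST.  The lockspace handle is assumed to come
from the lockspace-creation code in lockspace.c, and the name length must
not exceed DLM_RESNAME_MAXLEN per the checks above.

static void lock_ast(void *astarg)
{
	/* runs from the DLM's AST daemon when the request completes */
	complete((struct completion *) astarg);
}

static int take_ex_lock(void *lockspace, struct dlm_lksb *lksb)
{
	struct completion done;
	int error;

	init_completion(&done);
	error = dlm_lock(lockspace, DLM_LOCK_EX, lksb, 0,
			 "example", 7, 0 /* no parent */,
			 lock_ast, &done, NULL /* no bast */, NULL);
	if (error)
		return error;		/* rejected before being queued */

	wait_for_completion(&done);	/* grant (or failure) delivered */
	return 0;
}

A later conversion reuses the same lksb with DLM_LKF_CONVERT set;
convert_lock() below locates the lkb through lksb->sb_lkid.
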
3677 +
3678 +int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name,
3679 +                   int namelen)
3680 +{
3681 +       gd_res_t *rsb, *parent_rsb = NULL;
3682 +       gd_lkb_t *parent_lkb = lkb->lkb_parent;
3683 +       gd_resdata_t *rd;
3684 +       uint32_t nodeid;
3685 +       int error;
3686 +
3687 +       if (parent_lkb)
3688 +               parent_rsb = parent_lkb->lkb_resource;
3689 +
3690 +       error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
3691 +       if (error)
3692 +               goto out;
3693 +
3694 +       lkb->lkb_resource = rsb;
3695 +       lkb->lkb_nodeid = rsb->res_nodeid;
3696 +
3697 +       /* 
3698 +        * Next stage: do we need to find the master, or can
3699 +        * we get on with the real locking work?
3700 +        */
3701 +
3702 +       if (rsb->res_nodeid == -1) {
3703 +               if (get_directory_nodeid(rsb) != our_nodeid()) {
3704 +                       error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3705 +                       goto out;
3706 +               }
3707 +
3708 +               error = get_resdata(ls, our_nodeid(), rsb->res_name,
3709 +                                   rsb->res_length, &rd, 0);
3710 +               if (error)
3711 +                       goto out;
3712 +
3713 +               nodeid = rd->rd_master_nodeid;
3714 +               if (nodeid == our_nodeid())
3715 +                       nodeid = 0;
3716 +               rsb->res_nodeid = nodeid;
3717 +               lkb->lkb_nodeid = nodeid;
3718 +               rsb->res_resdir_seq = rd->rd_sequence;
3719 +       }
3720 +
3721 +       error = dlm_lock_stage2(ls, lkb, rsb, flags);
3722 +
3723 +      out:
3724 +       if (error)
3725 +               release_rsb(rsb);
3726 +
3727 +       return error;
3728 +}
3729 +
3730 +/* 
3731 + * Locking routine called after we have an RSB, either a copy of a remote one
3732 + * or a local one, or perhaps a shiny new one all of our very own
3733 + */
3734 +
3735 +int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags)
3736 +{
3737 +       int error = 0;
3738 +
3739 +       if (rsb->res_nodeid) {
3740 +               res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3741 +               error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
3742 +       } else {
3743 +               dlm_lock_stage3(lkb);
3744 +       }
3745 +
3746 +       return error;
3747 +}
3748 +
3749 +/* 
3750 + * Called on an RSB's master node to do stage2 locking for a remote lock
3751 + * request.  Returns a proper lkb with rsb ready for lock processing.
3752 + * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
3753 + */
3754 +
3755 +gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
3756 +                       struct gd_remlockrequest *freq)
3757 +{
3758 +       gd_res_t *rsb = NULL, *parent_rsb = NULL;
3759 +       gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3760 +       int error, namelen;
3761 +
3762 +       if (freq->rr_remparid) {
3763 +               parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
3764 +               if (!parent_lkb)
3765 +                       goto fail;
3766 +
3767 +               atomic_inc(&parent_lkb->lkb_childcnt);
3768 +               parent_rsb = parent_lkb->lkb_resource;
3769 +       }
3770 +
3771 +       /* 
3772 +        * A new MSTCPY lkb.  Initialize lkb fields including the real lkid and
3773 +        * node actually holding the (non-MSTCPY) lkb.  AST addresses are just
3774 +        * flags in the master copy.
3775 +        */
3776 +
3777 +       lkb = create_lkb(ls);
3778 +       if (!lkb)
3779 +               goto fail_dec;
3780 +       lkb->lkb_grmode = DLM_LOCK_IV;
3781 +       lkb->lkb_rqmode = freq->rr_rqmode;
3782 +       lkb->lkb_parent = parent_lkb;
3783 +       lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_COMPAST);
3784 +       lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_BLKAST);
3785 +       lkb->lkb_nodeid = remote_nodeid;
3786 +       lkb->lkb_remid = freq->rr_header.rh_lkid;
3787 +       lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
3788 +       lkb->lkb_lockqueue_flags = freq->rr_flags;
3789 +
3790 +       if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
3791 +               lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3792 +               allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
3793 +               if (!lkb->lkb_lvbptr)
3794 +                       goto fail_free;
3795 +       }
3796 +
3797 +       if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
3798 +               error = lkb_set_range(ls, lkb, freq->rr_range_start,
3799 +                                     freq->rr_range_end);
3800 +               if (error)
3801 +                       goto fail_free;
3802 +       }
3803 +
3804 +       /* 
3805 +        * Get the RSB which this lock is for.  Create a new RSB if this is a
3806 +        * new lock on a new resource.  We must be the master of any new rsb.
3807 +        */
3808 +
3809 +       namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
3810 +
3811 +       error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
3812 +                                  &rsb);
3813 +       if (error)
3814 +               goto fail_free;
3815 +
3816 +       lkb->lkb_resource = rsb;
3817 +       if (rsb->res_nodeid == -1)
3818 +               rsb->res_nodeid = 0;
3819 +       if (freq->rr_resdir_seq)
3820 +               rsb->res_resdir_seq = freq->rr_resdir_seq;
3821 +
3822 +       return lkb;
3823 +
3824 +
3825 +      fail_free:
3826 +       /* release_lkb handles parent */
3827 +       release_lkb(ls, lkb);
3828 +       parent_lkb = NULL;
3829 +
3830 +      fail_dec:
3831 +       if (parent_lkb)
3832 +               atomic_dec(&parent_lkb->lkb_childcnt);
3833 +      fail:
3834 +       return NULL;
3835 +}
3836 +
3837 +/* 
3838 + * The final bit of lock request processing on the master node.  Here the lock
3839 + * is granted and the completion ast is queued, or the lock is put on the
3840 + * waitqueue and blocking asts are sent.
3841 + */
3842 +
3843 +void dlm_lock_stage3(gd_lkb_t *lkb)
3844 +{
3845 +       gd_res_t *rsb = lkb->lkb_resource;
3846 +
3847 +       /* 
3848 +        * This is a locally mastered lock on a resource that already exists;
3849 +        * see if it can be granted or if it must wait.  When this function is
3850 +        * called for a remote lock request (process_cluster_request,
3851 +        * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
3852 +        * requesting node at the end of process_cluster_request, not at the
3853 +        * end of grant_lock.
3854 +        */
3855 +
3856 +       down_write(&rsb->res_lock);
3857 +
3858 +       if (can_be_granted(rsb, lkb)) {
3859 +               grant_lock(lkb, 0);
3860 +               goto out;
3861 +       }
3862 +
3863 +       /* 
3864 +        * This request is not a conversion, so the lkb didn't exist other than
3865 +        * for this request and should be freed after EAGAIN is returned in the
3866 +        * ast.
3867 +        */
3868 +
3869 +       if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
3870 +               lkb->lkb_flags |= GDLM_LKFLG_DELAST;
3871 +               lkb->lkb_retstatus = -EAGAIN;
3872 +               queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
3873 +               if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
3874 +                       send_blocking_asts_all(rsb, lkb);
3875 +               goto out;
3876 +       }
3877 +
3878 +       /* 
3879 +        * The requested lkb must wait.  Because the rsb of the requested lkb
3880 +        * is mastered here, send blocking asts for the lkb's blocking the
3881 +        * request.
3882 +        */
3883 +
3884 +       lkb->lkb_retstatus = 0;
3885 +       lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3886 +
3887 +       send_blocking_asts(rsb, lkb);
3888 +
3889 +      out:
3890 +       up_write(&rsb->res_lock);
3891 +}
3892 +
3893 +int dlm_unlock(void *lockspace,
3894 +              uint32_t lkid,
3895 +              uint32_t flags,
3896 +              struct dlm_lksb *lksb,
3897 +              void *astarg)
3898 +{
3899 +       gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
3900 +       gd_lkb_t *lkb;
3901 +       gd_res_t *rsb;
3902 +       int ret = -EINVAL;
3903 +
3904 +       if (!ls)
3905 +               goto out;
3906 +
3907 +       lkb = find_lock_by_id(ls, lkid);
3908 +       if (!lkb)
3909 +               goto out;
3910 +
3911 +       /* Can't dequeue a master copy (a remote node's mastered lock) */
3912 +       if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3913 +               goto out;
3914 +
3915 +       /* Already waiting for a remote lock operation */
3916 +       if (lkb->lkb_lockqueue_state) {
3917 +               ret = -EBUSY;
3918 +               goto out;
3919 +       }
3920 +
3921 +       /* Can only cancel WAITING or CONVERTing locks.
3922 +        * This is just a quick check - it is also checked in unlock_stage2()
3923 +        * (which may be on the master) under the semaphore.
3924 +        */
3925 +       if ((flags & DLM_LKF_CANCEL) &&
3926 +           (lkb->lkb_status == GDLM_LKSTS_GRANTED))
3927 +               goto out;
3928 +
3929 +       /* "Normal" unlocks must operate on a granted lock */
3930 +       if (!(flags & DLM_LKF_CANCEL) &&
3931 +           (lkb->lkb_status != GDLM_LKSTS_GRANTED))
3932 +               goto out;
3933 +
3934 +       down_write(&ls->ls_unlock_sem);
3935 +
3936 +       /* Can't dequeue a lock with sublocks */
3937 +       if (atomic_read(&lkb->lkb_childcnt)) {
3938 +               up_write(&ls->ls_unlock_sem);
3939 +               ret = -ENOTEMPTY;
3940 +               goto out;
3941 +       }
3942 +
3943 +       /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
3944 +       if (!(flags & DLM_LKF_CANCEL))
3945 +               lkb->lkb_flags |= GDLM_LKFLG_DELETED;
3946 +       up_write(&ls->ls_unlock_sem);
3947 +
3948 +       /* Save any new params */
3949 +       if (lksb)
3950 +               lkb->lkb_lksb = lksb;
3951 +       if (astarg)
3952 +               lkb->lkb_astparam = (long) astarg;
3953 +
3954 +       lkb->lkb_lockqueue_flags = flags;
3955 +
3956 +       rsb = lkb->lkb_resource;
3957 +
3958 +       down_read(&ls->ls_in_recovery);
3959 +
3960 +       if (rsb->res_nodeid)
3961 +               ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
3962 +       else
3963 +               ret = dlm_unlock_stage2(lkb, flags);
3964 +
3965 +       up_read(&ls->ls_in_recovery);
3966 +
3967 +       wake_astd();
3968 +
3969 +      out:
3970 +       return ret;
3971 +}
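
Continuing the sketch from dlm_lock() above, the matching teardown: the lkid
recorded in the lksb identifies the lock, and the original completion AST
fires again, this time with -DLM_EUNLOCK as the status.

static int drop_lock(void *lockspace, struct dlm_lksb *lksb)
{
	/* NULL lksb/astarg keep the values saved by the original
	   request, so the same completion AST is reused */
	return dlm_unlock(lockspace, lksb->sb_lkid, 0, NULL, NULL);
}
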
3972 +
3973 +int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
3974 +{
3975 +       gd_res_t *rsb = lkb->lkb_resource;
3976 +       int old_status;
3977 +       int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
3978 +
3979 +       down_write(&rsb->res_lock);
3980 +
3981 +       /* Can only cancel WAITING or CONVERTing locks */
3982 +       if ((flags & DLM_LKF_CANCEL) &&
3983 +           (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
3984 +               lkb->lkb_retstatus = -EINVAL;
3985 +               queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
3986 +               goto out;
3987 +       }
3988 +
3989 +       old_status = lkb_dequeue(lkb);
3990 +
3991 +       /* 
3992 +        * If it was granted, grant any converting or waiting locks.
3993 +        */
3994 +
3995 +       if (old_status == GDLM_LKSTS_GRANTED)
3996 +               grant_pending_locks(rsb);
3997 +
3998 +       /* 
3999 +        * Cancelling a conversion
4000 +        */
4001 +
4002 +       if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
4003 +               /* VMS semantics say we should send blocking ASTs again here */
4004 +               send_blocking_asts(rsb, lkb);
4005 +
4006 +               /* Remove from deadlock detection */
4007 +               if (lkb->lkb_duetime)
4008 +                       remove_from_deadlockqueue(lkb);
4009 +
4010 +               /* Stick it back on the granted queue */
4011 +               lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4012 +               lkb->lkb_rqmode = lkb->lkb_grmode;
4013 +
4014 +               /* Was it blocking any other locks? */
4015 +               if (first_in_list(lkb, &rsb->res_convertqueue))
4016 +                       grant_pending_locks(rsb);
4017 +
4018 +               lkb->lkb_retstatus = -DLM_ECANCEL;
4019 +               queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4020 +               goto out;
4021 +       }
4022 +
4023 +       /* 
4024 +        * The lvb can be saved or cleared on unlock.
4025 +        */
4026 +
4027 +       if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
4028 +               if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
4029 +                       memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4030 +               if (flags & DLM_LKF_IVVALBLK)
4031 +                       memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4032 +       }
4033 +
4034 +       lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4035 +       lkb->lkb_retstatus =
4036 +           (flags & DLM_LKF_CANCEL) ? -DLM_ECANCEL : -DLM_EUNLOCK;
4037 +       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4038 +
4039 +       /* 
4040 +        * Only free the LKB if we are the master copy.  Otherwise the AST
4041 +        * delivery routine will free it after delivery.  queue_ast for MSTCPY
4042 +        * lkb just sends a message.
4043 +        */
4044 +
4045 +       if (remote) {
4046 +               up_write(&rsb->res_lock);
4047 +               release_lkb(rsb->res_ls, lkb);
4048 +               release_rsb(rsb);
4049 +               goto out2;
4050 +       }
4051 +
4052 +      out:
4053 +       up_write(&rsb->res_lock);
4054 +      out2:
4055 +       wake_astd();
4056 +       return 0;
4057 +}
4058 +
4059 +/* 
4060 + * Lock conversion
4061 + */
4062 +
4063 +static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb,
4064 +                       int flags, void *ast, void *astarg, void *bast,
4065 +                       struct dlm_range *range)
4066 +{
4067 +       gd_lkb_t *lkb;
4068 +       gd_res_t *rsb;
4069 +       int ret = -EINVAL;
4070 +
4071 +       lkb = find_lock_by_id(ls, lksb->sb_lkid);
4072 +       if (!lkb) {
4073 +               goto out;
4074 +       }
4075 +
4076 +       if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4077 +               ret = -EBUSY;
4078 +               goto out;
4079 +       }
4080 +
4081 +       if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4082 +               goto out;
4083 +       }
4084 +
4085 +       if ((flags & DLM_LKF_QUECVT) &&
4086 +           !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4087 +               goto out;
4088 +       }
4089 +
4090 +       if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4091 +               goto out;
4092 +       }
4093 +
4094 +       if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
4095 +               goto out;
4096 +       }
4097 +
4098 +       /* Set up the ranges as appropriate */
4099 +       if (range) {
4100 +               if (range->ra_start > range->ra_end)
4101 +                       goto out;
4102 +
4103 +               if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4104 +                       ret = -ENOMEM;
4105 +                       goto out;
4106 +               }
4107 +       }
4108 +
4109 +       rsb = lkb->lkb_resource;
4110 +       down_read(&rsb->res_ls->ls_in_recovery);
4111 +
4112 +       lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4113 +       lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4114 +
4115 +       if (flags & DLM_LKF_NODLCKWT)
4116 +               lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4117 +       if (ast)
4118 +               lkb->lkb_astaddr = ast;
4119 +       if (astarg)
4120 +               lkb->lkb_astparam = (long) astarg;
4121 +       if (bast)
4122 +               lkb->lkb_bastaddr = bast;
4123 +       lkb->lkb_rqmode = mode;
4124 +       lkb->lkb_lockqueue_flags = flags;
4125 +       lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4126 +       lkb->lkb_lvbptr = lksb->sb_lvbptr;
4127 +
4128 +       if (rsb->res_nodeid) {
4129 +               res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4130 +               ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4131 +       } else {
4132 +               ret = dlm_convert_stage2(lkb, FALSE);
4133 +       }
4134 +
4135 +       up_read(&rsb->res_ls->ls_in_recovery);
4136 +
4137 +       wake_astd();
4138 +
4139 +      out:
4140 +       return ret;
4141 +}
4142 +
4143 +/* 
4144 + * For local conversion requests on locally mastered locks this is called
4145 + * directly from dlm_lock/convert_lock.  This function is also called for
4146 + * remote conversion requests of MSTCPY locks (from process_cluster_request).
4147 + */
4148 +
4149 +int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
4150 +{
4151 +       gd_res_t *rsb = lkb->lkb_resource;
4152 +       int ret = 0;
4153 +
4154 +       down_write(&rsb->res_lock);
4155 +
4156 +       if (can_be_granted(rsb, lkb)) {
4157 +               grant_lock(lkb, 0);
4158 +               grant_pending_locks(rsb);
4159 +               goto out;
4160 +       }
4161 +
4162 +       /* 
4163 +        * Remove lkb from granted queue.
4164 +        */
4165 +
4166 +       lkb_dequeue(lkb);
4167 +
4168 +       /* 
4169 +        * The user won't wait so stick it back on the grant queue
4170 +        */
4171 +
4172 +       if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4173 +               lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4174 +               ret = lkb->lkb_retstatus = -EAGAIN;
4175 +               if (do_ast)
4176 +                       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4177 +               if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4178 +                       send_blocking_asts_all(rsb, lkb);
4179 +               goto out;
4180 +       }
4181 +
4182 +       /* 
4183 +        * The lkb's status tells which queue it's on.  Put back on convert
4184 +        * queue.  (QUECVT requests are added at the end of the queue; all
4185 +        * others in order.)
4186 +        */
4187 +
4188 +       lkb->lkb_retstatus = 0;
4189 +       lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4190 +
4191 +       /* 
4192 +        * The request can't be granted, so notify the locks blocking it.
4193 +        */
4194 +
4195 +       send_blocking_asts(rsb, lkb);
4196 +
4197 +       if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4198 +               add_to_deadlockqueue(lkb);
4199 +
4200 +      out:
4201 +       up_write(&rsb->res_lock);
4202 +       return ret;
4203 +}
4204 +
4205 +/* 
4206 + * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4207 + * completion ast.  rsb res_lock must be held in write when this is called.
4208 + */
4209 +
4210 +static void grant_lock(gd_lkb_t *lkb, int send_remote)
4211 +{
4212 +       gd_res_t *rsb = lkb->lkb_resource;
4213 +
4214 +       if (lkb->lkb_duetime)
4215 +               remove_from_deadlockqueue(lkb);
4216 +
4217 +       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4218 +               int b;
4219 +               GDLM_ASSERT(lkb->lkb_lvbptr,);
4220 +
4221 +               if (!rsb->res_lvbptr)
4222 +                       rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4223 +
4224 +               b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4225 +               if (b)
4226 +                       memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4227 +               else
4228 +                       memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4229 +       }
4230 +
4231 +       if (lkb->lkb_range) {
4232 +               lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4233 +               lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4234 +       }
4235 +
4236 +       lkb->lkb_grmode = lkb->lkb_rqmode;
4237 +       lkb->lkb_rqmode = DLM_LOCK_IV;
4238 +       lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4239 +
4240 +       lkb->lkb_highbast = 0;
4241 +       lkb->lkb_retstatus = 0;
4242 +       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4243 +
4244 +       /* 
4245 +        * A remote conversion request has been granted, either immediately
4246 +        * upon being requested or after waiting a bit.  In the former case,
4247 +        * reply_and_grant() is called.  In the latter case, send_remote is 1
4248 +        * and remote_grant() is called.
4249 +        *
4250 +        * The "send_remote" flag is set only for locks which are granted "out
4251 +        * of band" - ie by another lock being converted or unlocked.
4252 +        *
4253 +        * The former case occurs when this lkb is granted right away as part
4254 +        * of processing the initial request.  In that case, we send a single
4255 +        * message in reply_and_grant() which combines the request reply with
4256 +        * the grant message.
4257 +        */
4258 +
4259 +       if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4260 +               if (send_remote)
4261 +                       remote_grant(lkb);
4262 +               else if (lkb->lkb_request)
4263 +                       reply_and_grant(lkb);
4264 +       }
4265 +
4266 +}
4267 +
4268 +static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb)
4269 +{
4270 +       gd_lkb_t *gr;
4271 +
4272 +       list_for_each_entry(gr, head, lkb_statequeue) {
4273 +               if (gr->lkb_bastaddr &&
4274 +                   gr->lkb_highbast < lkb->lkb_rqmode &&
4275 +                   ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4276 +                       queue_ast(gr, GDLM_QUEUE_BLKAST, lkb->lkb_rqmode);
4277 +                       gr->lkb_highbast = lkb->lkb_rqmode;
4278 +               }
4279 +       }
4280 +}
4281 +
4282 +/* 
4283 + * Notify granted locks if they are blocking a newly forced-to-wait lock.
4284 + */
4285 +
4286 +static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
4287 +{
4288 +       send_bast_queue(&rsb->res_grantqueue, lkb);
4289 +       /* check if the following improves performance */
4290 +       /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4291 +}
4292 +
4293 +static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
4294 +{
4295 +       send_bast_queue(&rsb->res_grantqueue, lkb);
4296 +       send_bast_queue(&rsb->res_convertqueue, lkb);
4297 +}
4298 +
4299 +/* 
4300 + * Called when a lock has been dequeued. Look for any locks to grant that are
4301 + * waiting for conversion or waiting to be granted.
4302 + * The rsb res_lock must be held in write when this function is called.
4303 + */
4304 +
4305 +int grant_pending_locks(gd_res_t *rsb)
4306 +{
4307 +       gd_lkb_t *lkb;
4308 +       struct list_head *list;
4309 +       struct list_head *temp;
4310 +       int8_t high = DLM_LOCK_IV;
4311 +
4312 +       list_for_each_safe(list, temp, &rsb->res_convertqueue) {
4313 +               lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4314 +
4315 +               if (can_be_granted(rsb, lkb))
4316 +                       grant_lock(lkb, 1);
4317 +               else
4318 +                       high = MAX(lkb->lkb_rqmode, high);
4319 +       }
4320 +
4321 +       list_for_each_safe(list, temp, &rsb->res_waitqueue) {
4322 +               lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4323 +
4324 +               if (can_be_granted(rsb, lkb))
4325 +                       grant_lock(lkb, 1);
4326 +               else
4327 +                       high = MAX(lkb->lkb_rqmode, high);
4328 +       }
4329 +
4330 +       /* 
4331 +        * If there are locks left on the wait/convert queue then send blocking
4332 +        * ASTs to granted locks that are blocking
4333 +        *
4334 +        * FIXME: This might generate some spurious blocking ASTs for range
4335 +        * locks.
4336 +        */
4337 +
4338 +       if (high > DLM_LOCK_IV) {
4339 +               list_for_each_safe(list, temp, &rsb->res_grantqueue) {
4340 +                       lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4341 +
4342 +                       if (lkb->lkb_bastaddr &&
4343 +                           (lkb->lkb_highbast < high) &&
4344 +                           !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4345 +
4346 +                               queue_ast(lkb, GDLM_QUEUE_BLKAST, high);
4347 +                               lkb->lkb_highbast = high;
4348 +                       }
4349 +               }
4350 +       }
4351 +
4352 +       return 0;
4353 +}
4354 +
4355 +/* 
4356 + * Called to cancel a locking operation that failed due to some internal
4357 + * reason.
4358 + *
4359 + * Waiting locks will be removed, converting locks will be reverted to their
4360 + * granted status, unlocks will be left where they are.
4361 + *
4362 + * A completion AST will be delivered to the caller.
4363 + */
4364 +
4365 +int cancel_lockop(gd_lkb_t *lkb, int status)
4366 +{
4367 +       int state = lkb->lkb_lockqueue_state;
4368 +
4369 +       lkb->lkb_lockqueue_state = 0;
4370 +
4371 +       switch (state) {
4372 +       case GDLM_LQSTATE_WAIT_RSB:
4373 +               lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4374 +               break;
4375 +
4376 +       case GDLM_LQSTATE_WAIT_CONDGRANT:
4377 +               res_lkb_dequeue(lkb);
4378 +               lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4379 +               break;
4380 +
4381 +       case GDLM_LQSTATE_WAIT_CONVERT:
4382 +               res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4383 +
4384 +               /* Remove from deadlock detection */
4385 +               if (lkb->lkb_duetime) {
4386 +                       remove_from_deadlockqueue(lkb);
4387 +               }
4388 +               break;
4389 +
4390 +       case GDLM_LQSTATE_WAIT_UNLOCK:
4391 +               /* We can leave this. I think.... */
4392 +               break;
4393 +       }
4394 +
4395 +       lkb->lkb_retstatus = status;
4396 +       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4397 +
4398 +       return 0;
4399 +}
4400 +
4401 +/* 
4402 + * Check for conversion deadlock.  If a deadlock is found,
4403 + * return the lkb to kill; otherwise return NULL.
4404 + */
4405 +
4406 +gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb)
4407 +{
4408 +       gd_res_t *rsb = lkb->lkb_resource;
4409 +       struct list_head *entry;
4410 +
4411 +       GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4412 +
4413 +       /* Work our way up to the head of the queue looking for locks that
4414 +        * conflict with us */
4415 +
4416 +       down_read(&rsb->res_lock);
4417 +
4418 +       entry = lkb->lkb_statequeue.prev;
4419 +       while (entry != &rsb->res_convertqueue) {
4420 +               gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue);
4421 +
4422 +               if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4423 +                       up_read(&rsb->res_lock);
4424 +                       return lkb;
4425 +               }
4426 +               entry = entry->prev;
4427 +       }
4428 +       up_read(&rsb->res_lock);
4429 +
4430 +       return NULL;
4431 +}
4432 +
4433 +/* 
4434 + * Conversion operation was cancelled by us (not the user).
4435 + * ret contains the return code to pass on to the user.
4436 + */
4437 +
4438 +void cancel_conversion(gd_lkb_t *lkb, int ret)
4439 +{
4440 +       gd_res_t *rsb = lkb->lkb_resource;
4441 +
4442 +       /* Stick it back on the granted queue */
4443 +       res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4444 +       lkb->lkb_rqmode = lkb->lkb_grmode;
4445 +
4446 +       remove_from_deadlockqueue(lkb);
4447 +
4448 +       lkb->lkb_retstatus = ret;
4449 +       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4450 +       wake_astd();
4451 +}
4452 +
4453 +/* 
4454 + * As new master of the rsb for this lkb, we need to handle these requests
4455 + * removed from the lockqueue and originating from local processes:
4456 + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4457 + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4458 + */
4459 +
4460 +void process_remastered_lkb(gd_lkb_t *lkb, int state)
4461 +{
4462 +       switch (state) {
4463 +       case GDLM_LQSTATE_WAIT_RSB:
4464 +               dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4465 +                               lkb->lkb_lockqueue_flags,
4466 +                               lkb->lkb_resource->res_name,
4467 +                               lkb->lkb_resource->res_length);
4468 +               break;
4469 +
4470 +       case GDLM_LQSTATE_WAIT_CONDGRANT:
4471 +               res_lkb_dequeue(lkb);
4472 +               dlm_lock_stage3(lkb);
4473 +               break;
4474 +
4475 +       case GDLM_LQSTATE_WAIT_UNLOCK:
4476 +               dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags);
4477 +               break;
4478 +
4479 +       case GDLM_LQSTATE_WAIT_CONVERT:
4480 +               dlm_convert_stage2(lkb, TRUE);
4481 +               break;
4482 +
4483 +       default:
4484 +               GDLM_ASSERT(0,);
4485 +       }
4486 +}
4487 diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4488 --- linux-orig/cluster/dlm/locking.h    1970-01-01 07:30:00.000000000 +0730
4489 +++ linux-patched/cluster/dlm/locking.h 2004-06-25 18:31:07.000000000 +0800
4490 @@ -0,0 +1,33 @@
4491 +/******************************************************************************
4492 +*******************************************************************************
4493 +**
4494 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
4495 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
4496 +**  
4497 +**  This copyrighted material is made available to anyone wishing to use,
4498 +**  modify, copy, or redistribute it subject to the terms and conditions
4499 +**  of the GNU General Public License v.2.
4500 +**
4501 +*******************************************************************************
4502 +******************************************************************************/
4503 +
4504 +#ifndef __LOCKING_DOT_H__
4505 +#define __LOCKING_DOT_H__
4506 +
4507 +void process_remastered_lkb(gd_lkb_t * lkb, int state);
4508 +void dlm_lock_stage3(gd_lkb_t * lkb);
4509 +int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast);
4510 +int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags);
4511 +int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb,
4512 +                   int flags);
4513 +gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen);
4514 +int free_rsb_if_unused(gd_res_t * rsb);
4515 +gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace,
4516 +                       struct gd_remlockrequest *freq);
4517 +int cancel_lockop(gd_lkb_t * lkb, int status);
4518 +int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags);
4519 +int grant_pending_locks(gd_res_t * rsb);
4520 +void cancel_conversion(gd_lkb_t * lkb, int ret);
4521 +gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb);
4522 +
4523 +#endif                         /* __LOCKING_DOT_H__ */
4524 diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4525 --- linux-orig/cluster/dlm/lockqueue.c  1970-01-01 07:30:00.000000000 +0730
4526 +++ linux-patched/cluster/dlm/lockqueue.c       2004-06-25 18:31:07.000000000 +0800
4527 @@ -0,0 +1,954 @@
4528 +/******************************************************************************
4529 +*******************************************************************************
4530 +**
4531 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
4532 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
4533 +**
4534 +**  This copyrighted material is made available to anyone wishing to use,
4535 +**  modify, copy, or redistribute it subject to the terms and conditions
4536 +**  of the GNU General Public License v.2.
4537 +**
4538 +*******************************************************************************
4539 +******************************************************************************/
4540 +
4541 +/*
4542 + * lockqueue.c
4543 + *
4544 + * This controls the lock queue, which is where locks
4545 + * come when they need to wait for a remote operation
4546 + * to complete.
4547 + *
4548 + * This could also be thought of as the "high-level" comms
4549 + * layer.
4550 + *
4551 + */
4552 +
4553 +#include "dlm_internal.h"
4554 +#include "lockqueue.h"
4555 +#include "dir.h"
4556 +#include "locking.h"
4557 +#include "lkb.h"
4558 +#include "lowcomms.h"
4559 +#include "midcomms.h"
4560 +#include "reccomms.h"
4561 +#include "nodes.h"
4562 +#include "lockspace.h"
4563 +#include "ast.h"
4564 +#include "memory.h"
4565 +#include "rsb.h"
4566 +#include "queries.h"
4567 +
4568 +static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply);
4569 +static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req);
4570 +
4571 +/*
4572 + * format of an entry on the request queue
4573 + */
4574 +struct rq_entry {
4575 +       struct list_head rqe_list;
4576 +       uint32_t rqe_nodeid;
4577 +       char rqe_request[1];
4578 +};
4579 +
4580 +/*
4581 + * Add a new request (if appropriate) to the request queue and send the remote
4582 + * request out.  Runs in the context of the locking caller.
4583 + *
4584 + * Recovery of a remote_stage request if the remote end fails while the lkb
4585 + * is still on the lockqueue:
4586 + *
4587 + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4588 + *   lockqueue_lkb_mark() at the start of recovery.
4589 + *
4590 + * o Some lkb's will be rebuilt on new master rsb's during recovery.
4591 + *   (depends on the type of request, see below).
4592 + *
4593 + * o At the end of recovery, resend_cluster_requests() looks at these
4594 + *   LQRESEND lkb's and either:
4595 + *
4596 + *   i) resends the request to the new master for the rsb where the
4597 + *      request is processed as usual.  The lkb remains on the lockqueue until
4598 + *      the new master replies and we run process_lockqueue_reply().
4599 + *
4600 + *   ii) if we've become the rsb master, removes the lkb from the lockqueue
4601 + *       and processes the request locally via process_remastered_lkb().
4602 + *
4603 + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4604 + *   and the request should be resent if the dest node has failed.
4605 + *
4606 + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4607 + * wait queue.  Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4608 + * makes send_lkb_queue() skip it).  Resend this request to the new master.
4609 + *
4610 + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue.  It will
4611 + * be rebuilt on the new master's rsb (restbl_lkb_send/send_lkb_queue).
4612 + * Resend this request to the new master.
4613 + *
4614 + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4615 + * It will be rebuilt on the new master rsb's granted queue.  Resend this
4616 + * request to the new master.
4617 + */
4618 +
4619 +int remote_stage(gd_lkb_t *lkb, int state)
4620 +{
4621 +       int error;
4622 +
4623 +       lkb->lkb_lockqueue_state = state;
4624 +       add_to_lockqueue(lkb);
4625 +
4626 +       error = send_cluster_request(lkb, state);
4627 +       if (error < 0) {
4628 +               log_print("remote_stage error sending request %d", error);
4629 +
4630 +               /* Leave on lockqueue, it will be resent to correct node during
4631 +                * recovery. */
4632 +
4633 +                /*
4634 +                lkb->lkb_lockqueue_state = 0;
4635 +                remove_from_lockqueue(lkb);
4636 +                return -ENOTCONN;
4637 +                */
4638 +       }
4639 +       return 0;
4640 +}
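#
# Illustration: the recovery scheme in the comment above reduces to one
# decision per LQRESEND-marked lkb once a new master is known: process the
# request locally if we now master the rsb, otherwise resend it.  A minimal
# userspace sketch of that decision; all names here (demo_lkb, recover_one)
# are hypothetical stand-ins, not the patch's API.
#
#include <stdio.h>

/* Hypothetical stand-ins for the GDLM_LQSTATE_* values documented above. */
enum lq_state { WAIT_RSB = 1, WAIT_CONVERT, WAIT_CONDGRANT, WAIT_UNLOCK };

struct demo_lkb {
        enum lq_state lq_state;         /* state while on the lockqueue */
        int master_nodeid;              /* rsb master chosen during recovery */
};

/* Mirrors the end-of-recovery choice for lkb's left on the lockqueue. */
static void recover_one(struct demo_lkb *lkb, int our_nodeid)
{
        if (lkb->master_nodeid == our_nodeid)
                printf("state %d: remove from lockqueue, process locally\n",
                       lkb->lq_state);
        else
                printf("state %d: resend to node %d, stay on lockqueue\n",
                       lkb->lq_state, lkb->master_nodeid);
}

int main(void)
{
        struct demo_lkb a = { WAIT_RSB, 2 }, b = { WAIT_CONVERT, 1 };

        recover_one(&a, 1);     /* resent to node 2 */
        recover_one(&b, 1);     /* the process_remastered_lkb() path */
        return 0;
}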
4641 +
4642 +/*
4643 + * Requests received while the lockspace is in recovery get added to the
4644 + * request queue and processed when recovery is complete.
4645 + */
4646 +
4647 +void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length)
4648 +{
4649 +       struct rq_entry *entry;
4650 +
4651 +       if (in_nodes_gone(ls, nodeid))
4652 +               return;
4653 +
4654 +       entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
4655 +       if (!entry) {
4656 +               // TODO something better
4657 +               printk("dlm: add_to_requestqueue: out of memory\n");
4658 +               return;
4659 +       }
4660 +
4661 +       log_debug(ls, "add_to_requestqueue %d", nodeid);
4662 +       entry->rqe_nodeid = nodeid;
4663 +       memcpy(entry->rqe_request, request, length);
4664 +       list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
4665 +}
4666 +
4667 +int process_requestqueue(gd_ls_t *ls)
4668 +{
4669 +       int error = 0, count = 0;
4670 +       struct rq_entry *entry, *safe;
4671 +       struct gd_req_header *req;
4672 +
4673 +       log_all(ls, "process held requests");
4674 +
4675 +       list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4676 +               req = (struct gd_req_header *) entry->rqe_request;
4677 +               log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
4678 +
4679 +               if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4680 +                       log_debug(ls, "process_requestqueue aborted");
4681 +                       error = -EINTR;
4682 +                       break;
4683 +               }
4684 +
4685 +               error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
4686 +               if (error == -EINTR) {
4687 +                       log_debug(ls, "process_requestqueue interrupted");
4688 +                       break;
4689 +               }
4690 +
4691 +               list_del(&entry->rqe_list);
4692 +               kfree(entry);
4693 +               count++;
4694 +               error = 0;
4695 +       }
4696 +
4697 +       log_all(ls, "processed %d requests", count);
4698 +       return error;
4699 +}
4700 +
4701 +void wait_requestqueue(gd_ls_t *ls)
4702 +{
4703 +       while (!list_empty(&ls->ls_requestqueue) &&
4704 +               test_bit(LSFL_LS_RUN, &ls->ls_flags))
4705 +               schedule();
4706 +}
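#
# Illustration: add_to_requestqueue()/process_requestqueue() implement a
# deferred-work queue, each entry carrying a private copy of the raw request
# in a trailing variable-length array (the rqe_request[1] idiom).  A minimal
# userspace sketch of the same pattern, under hypothetical names.
#
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One allocation holds the bookkeeping plus a copy of the request,
 * exactly like struct rq_entry's rqe_request[1] above. */
struct demo_entry {
        struct demo_entry *next;
        int nodeid;
        size_t length;
        char request[1];                /* really 'length' bytes */
};

static struct demo_entry *head, **tail = &head;

static int defer_request(int nodeid, const char *req, size_t length)
{
        /* "- 1" because the struct already contains the first byte */
        struct demo_entry *e = malloc(sizeof(*e) + length - 1);

        if (!e)
                return -1;
        e->next = NULL;
        e->nodeid = nodeid;
        e->length = length;
        memcpy(e->request, req, length);
        *tail = e;
        tail = &e->next;
        return 0;
}

static void drain_requests(void)
{
        while (head) {
                struct demo_entry *e = head;

                head = e->next;
                printf("processing %zu bytes from node %d\n",
                       e->length, e->nodeid);
                free(e);
        }
        tail = &head;
}

int main(void)
{
        defer_request(2, "lookup", 7);  /* arrives during "recovery" */
        defer_request(3, "unlock", 7);
        drain_requests();               /* recovery done: process in order */
        return 0;
}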
4707 +
4708 +/*
4709 + * Resdir requests (lookup or remove) and replies from before recovery are
4710 + * invalid since the resdir was rebuilt.  Clear them.  Requests from nodes now
4711 + * gone are also invalid.
4712 + */
4713 +
4714 +void purge_requestqueue(gd_ls_t *ls)
4715 +{
4716 +       int count = 0;
4717 +       struct rq_entry *entry, *safe;
4718 +       struct gd_req_header *req;
4719 +       struct gd_remlockrequest *freq;
4720 +       gd_lkb_t *lkb;
4721 +
4722 +       log_all(ls, "purge requests");
4723 +
4724 +       list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4725 +               req = (struct gd_req_header *) entry->rqe_request;
4726 +               freq = (struct gd_remlockrequest *) req;
4727 +
4728 +               if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
4729 +                   req->rh_cmd == GDLM_REMCMD_LOOKUP ||
4730 +                   in_nodes_gone(ls, entry->rqe_nodeid)) {
4731 +
4732 +                       list_del(&entry->rqe_list);
4733 +                       kfree(entry);
4734 +                       count++;
4735 +
4736 +               } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
4737 +
4738 +                       /*
4739 +                        * Replies to resdir lookups are invalid and must be
4740 +                        * purged.  The lookup requests are marked in
4741 +                        * lockqueue_lkb_mark and will be resent in
4742 +                        * resend_cluster_requests.  The only way to check if
4743 +                        * this is a lookup reply is to look at the
4744 +                        * lockqueue_state of the lkb.
4745 +                        */
4746 +
4747 +                       lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
4748 +                       GDLM_ASSERT(lkb,);
4749 +                       if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
4750 +                               list_del(&entry->rqe_list);
4751 +                               kfree(entry);
4752 +                               count++;
4753 +                       }
4754 +               }
4755 +       }
4756 +
4757 +       log_all(ls, "purged %d requests", count);
4758 +}
4759 +
4760 +/*
4761 + * Check if there's a reply for the given lkid in the requestqueue.
4762 + */
4763 +
4764 +int reply_in_requestqueue(gd_ls_t *ls, int lkid)
4765 +{
4766 +       int rv = FALSE;
4767 +       struct rq_entry *entry, *safe;
4768 +       struct gd_req_header *req;
4769 +       struct gd_remlockrequest *freq;
4770 +
4771 +       list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4772 +               req = (struct gd_req_header *) entry->rqe_request;
4773 +               freq = (struct gd_remlockrequest *) req;
4774 +
4775 +               if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
4776 +                   freq->rr_header.rh_lkid == lkid) {
4777 +                       rv = TRUE;
4778 +                       break;
4779 +               }
4780 +       }
4781 +
4782 +       return rv;
4783 +}
4784 +
4785 +void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src)
4786 +{
4787 +       if (!*lvbptr)
4788 +               *lvbptr = allocate_lvb(ls);
4789 +       if (*lvbptr)
4790 +               memcpy(*lvbptr, src, DLM_LVB_LEN);
4791 +}
4792 +
4793 +/*
4794 + * Process a lockqueue LKB after its remote processing has completed and it
4795 + * has been pulled from the lockqueue.  Runs in the context of the DLM recvd
4796 + * thread on the machine that requested the lock.
4797 + */
4798 +
4799 +static void process_lockqueue_reply(gd_lkb_t *lkb,
4800 +                                   struct gd_remlockreply *reply)
4801 +{
4802 +       int state = lkb->lkb_lockqueue_state;
4803 +       int oldstate;
4804 +       gd_res_t *rsb = lkb->lkb_resource;
4805 +       gd_ls_t *ls = rsb->res_ls;
4806 +
4807 +       lkb->lkb_lockqueue_state = 0;
4808 +       if (state)
4809 +               remove_from_lockqueue(lkb);
4810 +
4811 +       switch (state) {
4812 +       case GDLM_LQSTATE_WAIT_RSB:
4813 +
4814 +               GDLM_ASSERT(reply->rl_status == 0,);
4815 +
4816 +               if (reply->rl_nodeid == our_nodeid())
4817 +                       rsb->res_nodeid = 0;
4818 +               else
4819 +                       rsb->res_nodeid = reply->rl_nodeid;
4820 +
4821 +               rsb->res_resdir_seq = reply->rl_resdir_seq;
4822 +               lkb->lkb_nodeid = rsb->res_nodeid;
4823 +
4824 +               dlm_lock_stage2(rsb->res_ls, lkb, rsb,
4825 +                               lkb->lkb_lockqueue_flags);
4826 +               break;
4827 +
4828 +       case GDLM_LQSTATE_WAIT_CONVERT:
4829 +       case GDLM_LQSTATE_WAIT_CONDGRANT:
4830 +
4831 +               /*
4832 +                * After a remote lock/conversion/grant request we put the lock
4833 +                * on the right queue and send an AST if appropriate.  Any lock
4834 +                * shuffling (eg newly granted locks because this one was
4835 +                * converted downwards) will be dealt with in separate messages
4836 +                * (which may be in the same network message)
4837 +                */
4838 +
4839 +               if (!lkb->lkb_remid)
4840 +                       lkb->lkb_remid = reply->rl_lkid;
4841 +
4842 +               /*
4843 +                * The remote request failed (we assume because of NOQUEUE).
4844 +                * If this is a new request (non-conv) the lkb was created just
4845 +                * for it so the lkb should be freed.  If this was a
4846 +                * conversion, the lkb already existed so we should put it back
4847 +                * on the grant queue.
4848 +                */
4849 +
4850 +               if (reply->rl_status != 0) {
4851 +                       GDLM_ASSERT(reply->rl_status == -EAGAIN,);
4852 +
4853 +                       if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
4854 +                               res_lkb_dequeue(lkb);
4855 +                               lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4856 +                       } else
4857 +                               res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4858 +
4859 +                       lkb->lkb_retstatus = reply->rl_status;
4860 +                       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4861 +                       break;
4862 +               }
4863 +
4864 +               /*
4865 +                * The remote request was successful in granting the request or
4866 +                * queuing it to be granted later.  Add the lkb to the
4867 +                * appropriate rsb queue.
4868 +                */
4869 +
4870 +               switch (reply->rl_lockstate) {
4871 +               case GDLM_LKSTS_GRANTED:
4872 +
4873 +                       /* Compact version of grant_lock(). */
4874 +
4875 +                       down_write(&rsb->res_lock);
4876 +                       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
4877 +                               memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
4878 +                                      DLM_LVB_LEN);
4879 +
4880 +                       lkb->lkb_grmode = lkb->lkb_rqmode;
4881 +                       lkb->lkb_rqmode = DLM_LOCK_IV;
4882 +                       lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4883 +
4884 +                       if (lkb->lkb_range) {
4885 +                               lkb->lkb_range[GR_RANGE_START] =
4886 +                                   lkb->lkb_range[RQ_RANGE_START];
4887 +                               lkb->lkb_range[GR_RANGE_END] =
4888 +                                   lkb->lkb_range[RQ_RANGE_END];
4889 +                       }
4890 +                       up_write(&rsb->res_lock);
4891 +
4892 +                       lkb->lkb_retstatus = 0;
4893 +                       queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4894 +                       break;
4895 +
4896 +               case GDLM_LKSTS_WAITING:
4897 +
4898 +                       if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4899 +                               res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4900 +                       else
4901 +                               log_error(ls, "wait reply for granted %x %u",
4902 +                                         lkb->lkb_id, lkb->lkb_nodeid);
4903 +                       break;
4904 +
4905 +               case GDLM_LKSTS_CONVERT:
4906 +
4907 +                       if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4908 +                               res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4909 +                       else
4910 +                               log_error(ls, "convert reply for granted %x %u",
4911 +                                         lkb->lkb_id, lkb->lkb_nodeid);
4912 +                       break;
4913 +
4914 +               default:
4915 +                       log_error(ls, "process_lockqueue_reply state %d",
4916 +                                 reply->rl_lockstate);
4917 +               }
4918 +
4919 +               break;
4920 +
4921 +       case GDLM_LQSTATE_WAIT_UNLOCK:
4922 +
4923 +               /*
4924 +                * Unlocks should never fail.  Update local lock info.  This
4925 +                * always sends completion AST with status in lksb
4926 +                */
4927 +
4928 +               GDLM_ASSERT(reply->rl_status == 0,);
4929 +               oldstate = res_lkb_dequeue(lkb);
4930 +
4931 +               /* Differentiate between unlocks and conversion cancellations */
4932 +               if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
4933 +                   oldstate == GDLM_LKSTS_CONVERT) {
4934 +                       res_lkb_enqueue(lkb->lkb_resource, lkb,
4935 +                                       GDLM_LKSTS_GRANTED);
4936 +                       lkb->lkb_retstatus = -DLM_ECANCEL;
4937 +               } else {
4938 +                       lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4939 +                       lkb->lkb_retstatus = -DLM_EUNLOCK;
4940 +               }
4941 +               queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4942 +               break;
4943 +
4944 +       default:
4945 +               log_error(ls, "process_lockqueue_reply id %x state %d",
4946 +                         lkb->lkb_id, state);
4947 +       }
4948 +}
4949 +
4950 +/*
4951 + * Tell a remote node to grant a lock.  This happens when we are the master
4952 + * copy for a lock that is actually held on a remote node.  The remote end is
4953 + * also responsible for sending the completion AST.
4954 + */
4955 +
4956 +void remote_grant(gd_lkb_t *lkb)
4957 +{
4958 +       struct writequeue_entry *e;
4959 +       struct gd_remlockrequest *req;
4960 +
4961 +       // TODO Error handling
4962 +       e = lowcomms_get_buffer(lkb->lkb_nodeid,
4963 +                               sizeof(struct gd_remlockrequest),
4964 +                               lkb->lkb_resource->res_ls->ls_allocation,
4965 +                               (char **) &req);
4966 +       if (!e)
4967 +               return;
4968 +
4969 +       req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
4970 +       req->rr_header.rh_length = sizeof(struct gd_remlockrequest);
4971 +       req->rr_header.rh_flags = 0;
4972 +       req->rr_header.rh_lkid = lkb->lkb_id;
4973 +       req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
4974 +       req->rr_remlkid = lkb->lkb_remid;
4975 +       req->rr_flags = 0;
4976 +
4977 +       if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
4978 +               /* This is a confusing non-standard use of rr_flags which is
4979 +                * usually used to pass lockqueue_flags. */
4980 +               req->rr_flags |= GDLM_LKFLG_DEMOTED;
4981 +       }
4982 +
4983 +       add_request_lvb(lkb, req);
4984 +       midcomms_send_buffer(&req->rr_header, e);
4985 +}
4986 +
4987 +void reply_and_grant(gd_lkb_t *lkb)
4988 +{
4989 +       struct gd_remlockrequest *req = lkb->lkb_request;
4990 +       struct gd_remlockreply *reply;
4991 +       struct writequeue_entry *e;
4992 +
4993 +       // TODO Error handling
4994 +       e = lowcomms_get_buffer(lkb->lkb_nodeid,
4995 +                               sizeof(struct gd_remlockreply),
4996 +                               lkb->lkb_resource->res_ls->ls_allocation,
4997 +                               (char **) &reply);
4998 +       if (!e)
4999 +               return;
5000 +
5001 +       reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5002 +       reply->rl_header.rh_flags = 0;
5003 +       reply->rl_header.rh_length = sizeof(struct gd_remlockreply);
5004 +       reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5005 +       reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5006 +
5007 +       reply->rl_status = lkb->lkb_retstatus;
5008 +       reply->rl_lockstate = lkb->lkb_status;
5009 +       reply->rl_lkid = lkb->lkb_id;
5010 +
5011 +       GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
5012 +
5013 +       lkb->lkb_request = NULL;
5014 +
5015 +       add_reply_lvb(lkb, reply);
5016 +       midcomms_send_buffer(&reply->rl_header, e);
5017 +}
5018 +
5019 +/*
5020 + * Request removal of a dead entry in the resource directory
5021 + */
5022 +
5023 +void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen,
5024 +                          uint8_t sequence)
5025 +{
5026 +       struct writequeue_entry *e;
5027 +       struct gd_remlockrequest *req;
5028 +
5029 +       if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5030 +               gd_rcom_t *rc = allocate_rcom_buffer(ls);
5031 +
5032 +               memcpy(rc->rc_buf, name, namelen);
5033 +               rc->rc_datalen = namelen;
5034 +
5035 +               rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5036 +
5037 +               free_rcom_buffer(rc);
5038 +               return;
5039 +       }
5040 +       // TODO Error handling
5041 +       e = lowcomms_get_buffer(nodeid,
5042 +                               sizeof(struct gd_remlockrequest) + namelen - 1,
5043 +                               ls->ls_allocation, (char **) &req);
5044 +       if (!e)
5045 +               return;
5046 +
5047 +       memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1);
5048 +       req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5049 +       req->rr_header.rh_length =
5050 +           sizeof(struct gd_remlockrequest) + namelen - 1;
5051 +       req->rr_header.rh_flags = 0;
5052 +       req->rr_header.rh_lkid = 0;
5053 +       req->rr_header.rh_lockspace = ls->ls_global_id;
5054 +       req->rr_remlkid = 0;
5055 +       req->rr_resdir_seq = sequence;
5056 +       memcpy(req->rr_name, name, namelen);
5057 +
5058 +       midcomms_send_buffer(&req->rr_header, e);
5059 +}
5060 +
5061 +/*
5062 + * Send remote cluster request to directory or master node before the request
5063 + * is put on the lock queue.  Runs in the context of the locking caller.
5064 + */
5065 +
5066 +int send_cluster_request(gd_lkb_t *lkb, int state)
5067 +{
5068 +       uint32_t target_nodeid;
5069 +       gd_res_t *rsb = lkb->lkb_resource;
5070 +       gd_ls_t *ls = rsb->res_ls;
5071 +       struct gd_remlockrequest *req;
5072 +       struct writequeue_entry *e;
5073 +
5074 +       /* Need to know the target nodeid before we allocate a send buffer */
5075 +       target_nodeid = lkb->lkb_nodeid;
5076 +       GDLM_ASSERT(target_nodeid != 0,);
5077 +
5078 +       if (state == GDLM_LQSTATE_WAIT_RSB)
5079 +               target_nodeid = get_directory_nodeid(rsb);
5080 +
5081 +       GDLM_ASSERT(target_nodeid,);
5082 +
5083 +       if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5084 +               /* this may happen when called by resend_cluster_request */
5085 +               log_error(ls, "send_cluster_request to %u state %d recovery",
5086 +                         target_nodeid, state);
5087 +       }
5088 +
5089 +       e = lowcomms_get_buffer(target_nodeid,
5090 +                               sizeof(struct gd_remlockrequest) +
5091 +                               rsb->res_length - 1, ls->ls_allocation,
5092 +                               (char **) &req);
5093 +       if (!e)
5094 +               return -ENOBUFS;
5095 +       memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1);
5096 +
5097 +       /* Common stuff, some are just defaults */
5098 +
5099 +       if (lkb->lkb_bastaddr)
5100 +               req->rr_asts = GDLM_QUEUE_BLKAST;
5101 +       if (lkb->lkb_astaddr)
5102 +               req->rr_asts |= GDLM_QUEUE_COMPAST;
5103 +       if (lkb->lkb_parent)
5104 +               req->rr_remparid = lkb->lkb_parent->lkb_remid;
5105 +
5106 +       req->rr_flags = lkb->lkb_lockqueue_flags;
5107 +       req->rr_rqmode = lkb->lkb_rqmode;
5108 +       req->rr_remlkid = lkb->lkb_remid;
5109 +       req->rr_header.rh_length =
5110 +           sizeof(struct gd_remlockrequest) + rsb->res_length - 1;
5111 +       req->rr_header.rh_flags = 0;
5112 +       req->rr_header.rh_lkid = lkb->lkb_id;
5113 +       req->rr_header.rh_lockspace = ls->ls_global_id;
5114 +
5115 +       switch (state) {
5116 +
5117 +       case GDLM_LQSTATE_WAIT_RSB:
5118 +
5119 +               /* The lock must be a root lock */
5120 +               GDLM_ASSERT(!lkb->lkb_parent,);
5121 +
5122 +               req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5123 +               memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5124 +               break;
5125 +
5126 +       case GDLM_LQSTATE_WAIT_CONVERT:
5127 +
5128 +               req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5129 +               if (lkb->lkb_range) {
5130 +                       req->rr_flags |= GDLM_LKFLG_RANGE;
5131 +                       req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5132 +                       req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5133 +               }
5134 +               break;
5135 +
5136 +       case GDLM_LQSTATE_WAIT_CONDGRANT:
5137 +
5138 +               req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5139 +               req->rr_resdir_seq = rsb->res_resdir_seq;
5140 +               memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5141 +               if (lkb->lkb_range) {
5142 +                       req->rr_flags |= GDLM_LKFLG_RANGE;
5143 +                       req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5144 +                       req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5145 +               }
5146 +               break;
5147 +
5148 +       case GDLM_LQSTATE_WAIT_UNLOCK:
5149 +
5150 +               req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5151 +               break;
5152 +
5153 +       default:
5154 +               GDLM_ASSERT(!"Unknown cluster request",);
5155 +       }
5156 +
5157 +       add_request_lvb(lkb, req);
5158 +       midcomms_send_buffer(&req->rr_header, e);
5159 +
5160 +       return 0;
5161 +}
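#
# Illustration: every message built above is sized as
# sizeof(struct gd_remlockrequest) + res_length - 1 because rr_name is
# declared as a one-byte array at the end of the struct, so one byte of the
# name is already counted inside sizeof().  A hypothetical reduced header
# showing the same arithmetic:
#
#include <stdio.h>
#include <string.h>

struct demo_request {
        unsigned int lkid;
        unsigned char name[1];          /* really 'namelen' bytes */
};

int main(void)
{
        const char *resname = "myresource";
        size_t namelen = strlen(resname);
        size_t msglen = sizeof(struct demo_request) + namelen - 1;

        printf("header %zu + name %zu - 1 = message %zu bytes\n",
               sizeof(struct demo_request), namelen, msglen);
        return 0;
}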
5162 +
5163 +/*
5164 + * We got a request from another cluster node, process it and return an info
5165 + * structure with the lock state/LVB etc as required.  Executes in the DLM's
5166 + * recvd thread.
5167 + */
5168 +
5169 +int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery)
5170 +{
5171 +       gd_ls_t *lspace;
5172 +       gd_lkb_t *lkb = NULL;
5173 +       gd_res_t *rsb;
5174 +       int send_reply = 0, status = 0, namelen;
5175 +       struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req;
5176 +       struct gd_remlockreply reply;
5177 +
5178 +       lspace = find_lockspace_by_global_id(req->rh_lockspace);
5179 +
5180 +       if (!lspace) {
5181 +               log_print("process_cluster_request invalid lockspace %x "
5182 +                         "from %d req %u", req->rh_lockspace, nodeid,
5183 +                         req->rh_cmd);
5184 +               status = -EINVAL;
5185 +               goto out;
5186 +       }
5187 +
5188 +       /* wait for recoverd to drain requestqueue */
5189 +       if (!recovery)
5190 +               wait_requestqueue(lspace);
5191 +
5192 +       /*
5193 +        * If we're in recovery then queue the request for later.  Otherwise,
5194 +        * we still need to get the "in_recovery" lock to make sure the
5195 +        * recovery itself doesn't start until we are done.
5196 +        */
5197 + retry:
5198 +       if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5199 +               if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
5200 +                       log_error(lspace, "process_cluster_request warning %u",
5201 +                                 nodeid);
5202 +               add_to_requestqueue(lspace, nodeid, (char *) req,
5203 +                                   req->rh_length);
5204 +               log_debug(lspace, "process_cluster_request abort");
5205 +               status = -EINTR;
5206 +               goto out;
5207 +       }
5208 +       if (!down_read_trylock(&lspace->ls_in_recovery)) {
5209 +               schedule();
5210 +               goto retry;
5211 +       }
5212 +
5213 +
5214 +       /*
5215 +        * Process the request.
5216 +        */
5217 +
5218 +       switch (req->rh_cmd) {
5219 +
5220 +       case GDLM_REMCMD_LOOKUP:
5221 +               {
5222 +                       gd_resdata_t *rd;
5223 +                       int status;
5224 +                       uint32_t dir_nodeid;
5225 +
5226 +                       namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5227 +
5228 +                       dir_nodeid = name_to_directory_nodeid(lspace,
5229 +                                                             freq->rr_name,
5230 +                                                             namelen);
5231 +                       if (dir_nodeid != our_nodeid())
5232 +                               log_debug(lspace, "ignoring directory lookup");
5233 +
5234 +                       status = get_resdata(lspace, nodeid, freq->rr_name,
5235 +                                            namelen, &rd, 0);
5236 +                       if (status)
5237 +                               status = -ENOMEM;
5238 +
5239 +                       reply.rl_status = status;
5240 +                       reply.rl_lockstate = 0;
5241 +                       reply.rl_nodeid = rd->rd_master_nodeid;
5242 +                       reply.rl_resdir_seq = rd->rd_sequence;
5243 +               }
5244 +               send_reply = 1;
5245 +               break;
5246 +
5247 +       case GDLM_REMCMD_REM_RESDATA:
5248 +
5249 +               namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5250 +               remove_resdata(lspace, nodeid, freq->rr_name, namelen,
5251 +                              freq->rr_resdir_seq);
5252 +               break;
5253 +
5254 +       case GDLM_REMCMD_LOCKREQUEST:
5255 +
5256 +               lkb = remote_stage2(nodeid, lspace, freq);
5257 +               if (lkb) {
5258 +                       lkb->lkb_request = freq;
5259 +                       dlm_lock_stage3(lkb);
5260 +
5261 +                       /*
5262 +                        * If the request was granted in lock_stage3, then a
5263 +                        * reply message was already sent in combination with
5264 +                        * the grant message and lkb_request is NULL.
5265 +                        */
5266 +
5267 +                       if (lkb->lkb_request) {
5268 +                               lkb->lkb_request = NULL;
5269 +                               send_reply = 1;
5270 +                               reply.rl_status = lkb->lkb_retstatus;
5271 +                               reply.rl_lockstate = lkb->lkb_status;
5272 +                               reply.rl_lkid = lkb->lkb_id;
5273 +
5274 +                               /*
5275 +                                * If the request could not be granted and the
5276 +                                * user won't wait, then free up the LKB
5277 +                                */
5278 +
5279 +                               if (lkb->lkb_flags & GDLM_LKFLG_DELAST) {
5280 +                                       rsb = lkb->lkb_resource;
5281 +                                       release_lkb(lspace, lkb);
5282 +                                       release_rsb(rsb);
5283 +                                       lkb = NULL;
5284 +                               }
5285 +                       }
5286 +               } else {
5287 +                       reply.rl_status = -ENOMEM;
5288 +                       send_reply = 1;
5289 +               }
5290 +               break;
5291 +
5292 +       case GDLM_REMCMD_CONVREQUEST:
5293 +
5294 +               lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5295 +
5296 +               GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5297 +                                       freq->rr_remlkid,
5298 +                                       freq->rr_header.rh_lkid, nodeid););
5299 +
5300 +               if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5301 +                       log_error(lspace, "convrequest: invalid status %d",
5302 +                                 lkb->lkb_status);
5303 +
5304 +               lkb->lkb_rqmode = freq->rr_rqmode;
5305 +               lkb->lkb_lockqueue_flags = freq->rr_flags;
5306 +               lkb->lkb_request = freq;
5307 +               lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5308 +
5309 +               if (lkb->lkb_flags & GDLM_LKFLG_VALBLK
5310 +                   || freq->rr_flags & DLM_LKF_VALBLK) {
5311 +                       lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5312 +                       allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5313 +                                             freq->rr_lvb);
5314 +               }
5315 +
5316 +               if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5317 +                       if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5318 +                                         freq->rr_range_end)) {
5319 +                               reply.rl_status = -ENOMEM;
5320 +                               send_reply = 1;
5321 +                               goto out;
5322 +                       }
5323 +               }
5324 +
5325 +               dlm_convert_stage2(lkb, FALSE);
5326 +
5327 +               /*
5328 +                * If the conv request was granted in stage2, then a reply
5329 +                * message was already sent in combination with the grant
5330 +                * message.
5331 +                */
5332 +
5333 +               if (lkb->lkb_request) {
5334 +                       lkb->lkb_request = NULL;
5335 +                       send_reply = 1;
5336 +                       reply.rl_status = lkb->lkb_retstatus;
5337 +                       reply.rl_lockstate = lkb->lkb_status;
5338 +                       reply.rl_lkid = lkb->lkb_id;
5339 +               }
5340 +               break;
5341 +
5342 +       case GDLM_REMCMD_LOCKREPLY:
5343 +
5344 +               lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid);
5345 +
5346 +               GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5347 +                                       freq->rr_remlkid,
5348 +                                       freq->rr_header.rh_lkid, nodeid););
5349 +
5350 +               process_lockqueue_reply(lkb, (struct gd_remlockreply *) req);
5351 +               break;
5352 +
5353 +       case GDLM_REMCMD_LOCKGRANT:
5354 +
5355 +               /*
5356 +                * Remote lock has been granted asynchronously.  Do a compact
5357 +                * version of what grant_lock() does.
5358 +                */
5359 +
5360 +               lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5361 +
5362 +               GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5363 +                                       freq->rr_remlkid,
5364 +                                       freq->rr_header.rh_lkid, nodeid););
5365 +
5366 +               rsb = lkb->lkb_resource;
5367 +
5368 +               if (lkb->lkb_lockqueue_state)
5369 +                       log_error(rsb->res_ls, "granting lock on lockqueue "
5370 +                                 "id=%x from=%u lqstate=%d flags=%x",
5371 +                                 lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state,
5372 +                                 lkb->lkb_flags);
5373 +
5374 +               down_write(&rsb->res_lock);
5375 +
5376 +               if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5377 +                       memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
5378 +
5379 +               lkb->lkb_grmode = lkb->lkb_rqmode;
5380 +               lkb->lkb_rqmode = DLM_LOCK_IV;
5381 +
5382 +               if (lkb->lkb_range) {
5383 +                       lkb->lkb_range[GR_RANGE_START] =
5384 +                           lkb->lkb_range[RQ_RANGE_START];
5385 +                       lkb->lkb_range[GR_RANGE_END] =
5386 +                           lkb->lkb_range[RQ_RANGE_END];
5387 +               }
5388 +
5389 +               lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5390 +               up_write(&rsb->res_lock);
5391 +
5392 +               if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5393 +                       lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5394 +
5395 +               lkb->lkb_retstatus = 0;
5396 +               queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
5397 +               break;
5398 +
5399 +       case GDLM_REMCMD_SENDBAST:
5400 +
5401 +               lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5402 +
5403 +               GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5404 +                                       freq->rr_remlkid,
5405 +                                       freq->rr_header.rh_lkid, nodeid););
5406 +
5407 +               if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5408 +                       queue_ast(lkb, GDLM_QUEUE_BLKAST, freq->rr_rqmode);
5409 +               break;
5410 +
5411 +       case GDLM_REMCMD_SENDCAST:
5412 +
5413 +               /* This is only used for some error completion ASTs */
5414 +
5415 +               lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5416 +
5417 +               GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5418 +                                       freq->rr_remlkid,
5419 +                                       freq->rr_header.rh_lkid, nodeid););
5420 +
5421 +               /* Return the lock to granted status */
5422 +               res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5423 +
5424 +               lkb->lkb_retstatus = freq->rr_status;
5425 +               queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
5426 +               break;
5427 +
5428 +       case GDLM_REMCMD_UNLOCKREQUEST:
5429 +
5430 +               lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5431 +
5432 +               GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5433 +                                       freq->rr_remlkid,
5434 +                                       freq->rr_header.rh_lkid, nodeid););
5435 +
5436 +               reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags);
5437 +               send_reply = 1;
5438 +               break;
5439 +
5440 +       case GDLM_REMCMD_QUERY:
5441 +               remote_query(nodeid, lspace, req);
5442 +               break;
5443 +
5444 +       case GDLM_REMCMD_QUERYREPLY:
5445 +               remote_query_reply(nodeid, lspace, req);
5446 +               break;
5447 +
5448 +       default:
5449 +               log_error(lspace, "process_cluster_request cmd %d", req->rh_cmd);
5450 +       }
5451 +
5452 +       up_read(&lspace->ls_in_recovery);
5453 +
5454 +      out:
5455 +       if (send_reply) {
5456 +               reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5457 +               reply.rl_header.rh_flags = 0;
5458 +               reply.rl_header.rh_length = sizeof(reply);
5459 +               reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
5460 +               reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
5461 +
5462 +               status = midcomms_send_message(nodeid, &reply.rl_header,
5463 +                                              GFP_KERNEL);
5464 +       }
5465 +
5466 +       wake_astd();
5467 +
5468 +       return status;
5469 +}
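#
# Illustration: the "retry:" loop above keeps the recvd thread from blocking
# on ls_in_recovery; if recovery holds the write side, the thread yields,
# rechecks LSFL_LS_RUN (the request may now belong on the requestqueue) and
# tries again.  A minimal pthread analogue of that trylock-and-retry pattern
# (hypothetical names; build with -lpthread):
#
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/* Readers are ordinary requests; the writer is recovery itself. */
static pthread_rwlock_t in_recovery = PTHREAD_RWLOCK_INITIALIZER;
static volatile int ls_running = 1;

static int process_request(const char *req)
{
        for (;;) {
                if (!ls_running)
                        return -1;      /* would be deferred for later */
                if (pthread_rwlock_tryrdlock(&in_recovery) == 0)
                        break;          /* recovery not running: go ahead */
                sched_yield();          /* recovery in progress: retry */
        }
        printf("processed: %s\n", req);
        pthread_rwlock_unlock(&in_recovery);
        return 0;
}

int main(void)
{
        return process_request("convert request") ? 1 : 0;
}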
5470 +
5471 +static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply)
5472 +{
5473 +       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5474 +               memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5475 +}
5476 +
5477 +static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req)
5478 +{
5479 +       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5480 +               memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5481 +}
5482 diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
5483 --- linux-orig/cluster/dlm/lockqueue.h  1970-01-01 07:30:00.000000000 +0730
5484 +++ linux-patched/cluster/dlm/lockqueue.h       2004-06-25 18:31:07.000000000 +0800
5485 @@ -0,0 +1,29 @@
5486 +/******************************************************************************
5487 +*******************************************************************************
5488 +**
5489 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
5490 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
5491 +**  
5492 +**  This copyrighted material is made available to anyone wishing to use,
5493 +**  modify, copy, or redistribute it subject to the terms and conditions
5494 +**  of the GNU General Public License v.2.
5495 +**
5496 +*******************************************************************************
5497 +******************************************************************************/
5498 +
5499 +#ifndef __LOCKQUEUE_DOT_H__
5500 +#define __LOCKQUEUE_DOT_H__
5501 +
5502 +void remote_grant(gd_lkb_t * lkb);
5503 +void reply_and_grant(gd_lkb_t * lkb);
5504 +int remote_stage(gd_lkb_t * lkb, int state);
5505 +int process_cluster_request(int csid, struct gd_req_header *req, int recovery);
5506 +int send_cluster_request(gd_lkb_t * lkb, int state);
5507 +void purge_requestqueue(gd_ls_t * ls);
5508 +int process_requestqueue(gd_ls_t * ls);
5509 +int reply_in_requestqueue(gd_ls_t * ls, int lkid);
5510 +void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen,
5511 +                          uint8_t sequence);
5512 +void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src);
5513 +
5514 +#endif                         /* __LOCKQUEUE_DOT_H__ */
5515 diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
5516 --- linux-orig/cluster/dlm/lockspace.c  1970-01-01 07:30:00.000000000 +0730
5517 +++ linux-patched/cluster/dlm/lockspace.c       2004-06-25 18:31:07.000000000 +0800
5518 @@ -0,0 +1,706 @@
5519 +/******************************************************************************
5520 +*******************************************************************************
5521 +**
5522 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
5523 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
5524 +**
5525 +**  This copyrighted material is made available to anyone wishing to use,
5526 +**  modify, copy, or redistribute it subject to the terms and conditions
5527 +**  of the GNU General Public License v.2.
5528 +**
5529 +*******************************************************************************
5530 +******************************************************************************/
5531 +
5532 +#include <linux/module.h>
5533 +
5534 +#include "dlm_internal.h"
5535 +#include "recoverd.h"
5536 +#include "ast.h"
5537 +#include "lkb.h"
5538 +#include "nodes.h"
5539 +#include "dir.h"
5540 +#include "lowcomms.h"
5541 +#include "config.h"
5542 +#include "memory.h"
5543 +#include "lockspace.h"
5544 +#include "device.h"
5545 +
5546 +#define GDST_NONE       (0)
5547 +#define GDST_RUNNING    (1)
5548 +
5549 +static int gdlmstate;
5550 +static int gdlmcount;
5551 +static struct semaphore gdlmstate_lock;
5552 +struct list_head lslist;
5553 +spinlock_t lslist_lock;
5554 +struct kcl_service_ops ls_ops;
5555 +
5556 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
5557 +
5558 +
5559 +void dlm_lockspace_init(void)
5560 +{
5561 +       gdlmstate = GDST_NONE;
5562 +       gdlmcount = 0;
5563 +       init_MUTEX(&gdlmstate_lock);
5564 +       INIT_LIST_HEAD(&lslist);
5565 +       spin_lock_init(&lslist_lock);
5566 +}
5567 +
5568 +gd_ls_t *find_lockspace_by_global_id(uint32_t id)
5569 +{
5570 +       gd_ls_t *ls;
5571 +
5572 +       spin_lock(&lslist_lock);
5573 +
5574 +       list_for_each_entry(ls, &lslist, ls_list) {
5575 +               if (ls->ls_global_id == id)
5576 +                       goto out;
5577 +       }
5578 +       ls = NULL;
5579 +      out:
5580 +       spin_unlock(&lslist_lock);
5581 +       return ls;
5582 +}
5583 +
5584 +/* TODO: make this more efficient */
5585 +gd_ls_t *find_lockspace_by_local_id(void *id)
5586 +{
5587 +       gd_ls_t *ls;
5588 +
5589 +       spin_lock(&lslist_lock);
5590 +
5591 +       list_for_each_entry(ls, &lslist, ls_list) {
5592 +               if (ls->ls_local_id == (uint32_t)(long)id)
5593 +                       goto out;
5594 +       }
5595 +       ls = NULL;
5596 +      out:
5597 +       spin_unlock(&lslist_lock);
5598 +       return ls;
5599 +}
5600 +
5601 +gd_ls_t *find_lockspace_by_name(char *name, int namelen)
5602 +{
5603 +       gd_ls_t *ls;
5604 +
5605 +       spin_lock(&lslist_lock);
5606 +
5607 +       list_for_each_entry(ls, &lslist, ls_list) {
5608 +               if (ls->ls_namelen == namelen &&
5609 +                   memcmp(ls->ls_name, name, namelen) == 0)
5610 +                       goto out;
5611 +       }
5612 +       ls = NULL;
5613 +      out:
5614 +       spin_unlock(&lslist_lock);
5615 +       return ls;
5616 +}
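#
# Illustration: the three find_lockspace_by_*() helpers share one shape --
# walk the global list under lslist_lock and leave through a single unlock
# point.  A userspace sketch of the same shape with a pthread mutex
# (demo_ls and find_by_name are hypothetical names):
#
#include <pthread.h>
#include <string.h>

struct demo_ls {
        struct demo_ls *next;
        unsigned int global_id;
        char name[32];
};

static struct demo_ls *ls_list;
static pthread_mutex_t ls_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct demo_ls *find_by_name(const char *name)
{
        struct demo_ls *ls;

        pthread_mutex_lock(&ls_list_lock);
        for (ls = ls_list; ls; ls = ls->next)
                if (strcmp(ls->name, name) == 0)
                        break;
        pthread_mutex_unlock(&ls_list_lock);
        return ls;                      /* NULL when not found */
}

int main(void)
{
        struct demo_ls a = { 0, 42, "default" };

        ls_list = &a;
        return find_by_name("default") ? 0 : 1;
}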
5617 +
5618 +/*
5619 + * Called from dlm_init.  These are the general threads which are not
5620 + * lockspace-specific and work for all gdlm lockspaces.
5621 + */
5622 +
5623 +static int threads_start(void)
5624 +{
5625 +       int error;
5626 +
5627 +       /* Thread which interacts with cman for all ls's */
5628 +       error = recoverd_start();
5629 +       if (error) {
5630 +               log_print("cannot start recovery thread %d", error);
5631 +               goto fail;
5632 +       }
5633 +
5634 +       /* Thread which processes lock requests for all ls's */
5635 +       error = astd_start();
5636 +       if (error) {
5637 +               log_print("cannot start ast thread %d", error);
5638 +               goto recoverd_fail;
5639 +       }
5640 +
5641 +       /* Thread for sending/receiving messages for all ls's */
5642 +       error = lowcomms_start();
5643 +       if (error) {
5644 +               log_print("cannot start lowcomms %d", error);
5645 +               goto astd_fail;
5646 +       }
5647 +
5648 +       return 0;
5649 +
5650 +      astd_fail:
5651 +       astd_stop();
5652 +
5653 +      recoverd_fail:
5654 +       recoverd_stop();
5655 +
5656 +      fail:
5657 +       return error;
5658 +}
5659 +
5660 +static void threads_stop(void)
5661 +{
5662 +       lowcomms_stop();
5663 +       astd_stop();
5664 +       recoverd_stop();
5665 +}
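#
# Illustration: threads_start() uses the classic kernel unwinding idiom --
# each failure label undoes exactly the stages that already succeeded, in
# reverse order, so there is a single success path and no leaked state.
# A self-contained sketch of the idiom (all functions here are hypothetical
# stand-ins, with the third stage failing on purpose):
#
#include <stdio.h>

static int start_recoverd(void) { puts("recoverd up");   return 0; }
static int start_astd(void)     { puts("astd up");       return 0; }
static int start_lowcomms(void) { puts("lowcomms FAILS"); return -1; }
static void stop_recoverd(void) { puts("recoverd down"); }
static void stop_astd(void)     { puts("astd down"); }

static int bring_up(void)
{
        int error;

        error = start_recoverd();
        if (error)
                goto fail;
        error = start_astd();
        if (error)
                goto recoverd_fail;
        error = start_lowcomms();
        if (error)
                goto astd_fail;
        return 0;

      astd_fail:
        stop_astd();
      recoverd_fail:
        stop_recoverd();
      fail:
        return error;
}

int main(void)
{
        return bring_up() ? 1 : 0;
}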
5666 +
5667 +static int init_internal(void)
5668 +{
5669 +       int error = 0;
5670 +
5671 +       if (gdlmstate == GDST_RUNNING)
5672 +               gdlmcount++;
5673 +       else {
5674 +               error = threads_start();
5675 +               if (error)
5676 +                       goto out;
5677 +
5678 +               gdlmstate = GDST_RUNNING;
5679 +               gdlmcount = 1;
5680 +       }
5681 +
5682 +      out:
5683 +       return error;
5684 +}
5685 +
5686 +
5687 +/*
5688 + * Called after gdlm module is loaded and before any lockspaces are created.
5689 + * Starts and initializes global threads and structures.  These global entities
5690 + * are shared by and independent of all lockspaces.
5691 + *
5692 + * There should be a gdlm-specific user command which a person can run to
5693 + * call this function.  If a user hasn't run that command and something
5694 + * creates a new lockspace, this is called first.
5695 + *
5696 + * This also starts the default lockspace.
5697 + */
5698 +
5699 +int dlm_init(void)
5700 +{
5701 +       int error;
5702 +
5703 +       down(&gdlmstate_lock);
5704 +       error = init_internal();
5705 +       up(&gdlmstate_lock);
5706 +
5707 +       return error;
5708 +}
5709 +
5710 +int dlm_release(void)
5711 +{
5712 +       int error = 0;
5713 +
5714 +       down(&gdlmstate_lock);
5715 +
5716 +       if (gdlmstate == GDST_NONE)
5717 +               goto out;
5718 +
5719 +       if (gdlmcount)
5720 +               gdlmcount--;
5721 +
5722 +       if (gdlmcount)
5723 +               goto out;
5724 +
5725 +       spin_lock(&lslist_lock);
5726 +       if (!list_empty(&lslist)) {
5727 +               spin_unlock(&lslist_lock);
5728 +               log_print("cannot stop threads, lockspaces still exist");
5729 +               goto out;
5730 +       }
5731 +       spin_unlock(&lslist_lock);
5732 +
5733 +       threads_stop();
5734 +       gdlmstate = GDST_NONE;
5735 +
5736 +      out:
5737 +       up(&gdlmstate_lock);
5738 +
5739 +       return error;
5740 +}
5741 +
5742 +gd_ls_t *allocate_ls(int namelen)
5743 +{
5744 +       gd_ls_t *ls;
5745 +
5746 +       /* FIXME: use appropriate malloc type */
5747 +
5748 +       ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL);
5749 +       if (ls)
5750 +               memset(ls, 0, sizeof(gd_ls_t) + namelen);
5751 +
5752 +       return ls;
5753 +}
5754 +
5755 +void free_ls(gd_ls_t *ls)
5756 +{
5757 +       kfree(ls);
5758 +}
5759 +
5760 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
5761 +{
5762 +       gd_ls_t *ls;
5763 +       int i, error = -ENOMEM;
5764 +       uint32_t local_id = 0;
5765 +
5766 +       if (!try_module_get(THIS_MODULE))
5767 +               return -EINVAL;
5768 +
5769 +       if (namelen > MAX_SERVICE_NAME_LEN)
5770 +               return -EINVAL;
5771 +
5772 +       if ((ls = find_lockspace_by_name(name, namelen))) {
5773 +               *lockspace = (void *)ls->ls_local_id;
5774 +               return -EEXIST;
5775 +       }
5776 +
5777 +       /*
5778 +        * Initialize ls fields
5779 +        */
5780 +
5781 +       ls = allocate_ls(namelen);
5782 +       if (!ls)
5783 +               goto out;
5784 +
5785 +       memcpy(ls->ls_name, name, namelen);
5786 +       ls->ls_namelen = namelen;
5787 +
5788 +       ls->ls_allocation = GFP_KERNEL;
5789 +       memset(&ls->ls_flags, 0, sizeof(unsigned long));
5790 +       INIT_LIST_HEAD(&ls->ls_rootres);
5791 +       ls->ls_hashsize = dlm_config.reshashtbl;
5792 +       ls->ls_hashmask = ls->ls_hashsize - 1;
5793 +
5794 +       ls->ls_reshashtbl =
5795 +           kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL);
5796 +       if (!ls->ls_reshashtbl)
5797 +               goto out_lsfree;
5798 +
5799 +       for (i = 0; i < ls->ls_hashsize; i++)
5800 +               INIT_LIST_HEAD(&ls->ls_reshashtbl[i]);
5801 +
5802 +       rwlock_init(&ls->ls_reshash_lock);
5803 +
5804 +       if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1)
5805 +               goto out_htfree;
5806 +
5807 +       INIT_LIST_HEAD(&ls->ls_nodes);
5808 +       ls->ls_num_nodes = 0;
5809 +       INIT_LIST_HEAD(&ls->ls_nodes_gone);
5810 +       INIT_LIST_HEAD(&ls->ls_recover);
5811 +       spin_lock_init(&ls->ls_recover_lock);
5812 +       INIT_LIST_HEAD(&ls->ls_recover_list);
5813 +       ls->ls_recover_list_count = 0;
5814 +       spin_lock_init(&ls->ls_recover_list_lock);
5815 +       init_waitqueue_head(&ls->ls_wait_general);
5816 +       INIT_LIST_HEAD(&ls->ls_requestqueue);
5817 +       INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
5818 +       ls->ls_last_stop = 0;
5819 +       ls->ls_last_start = 0;
5820 +       ls->ls_last_finish = 0;
5821 +       ls->ls_rcom_msgid = 0;
5822 +       init_MUTEX(&ls->ls_rcom_lock);
5823 +       init_rwsem(&ls->ls_in_recovery);
5824 +       init_rwsem(&ls->ls_unlock_sem);
5825 +       init_rwsem(&ls->ls_rec_rsblist);
5826 +       init_rwsem(&ls->ls_gap_rsblist);
5827 +       down_write(&ls->ls_in_recovery);
5828 +
5829 +       for (i = 0; i < RESDIRHASH_SIZE; i++) {
5830 +               INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist);
5831 +               rwlock_init(&ls->ls_resdir_hash[i].rb_lock);
5832 +       }
5833 +
5834 +       if (flags & DLM_LSF_NOTIMERS)
5835 +               set_bit(LSFL_NOTIMERS, &ls->ls_flags);
5836 +
5837 +       /*
5838 +        * Connect this lockspace with the cluster manager
5839 +        */
5840 +
5841 +       error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
5842 +                                    &ls_ops, TRUE, (void *) ls, &local_id);
5843 +       if (error)
5844 +               goto out_idtblfree;
5845 +
5846 +       ls->ls_state = LSST_INIT;
5847 +       ls->ls_local_id = local_id;
5848 +
5849 +       spin_lock(&lslist_lock);
5850 +       list_add(&ls->ls_list, &lslist);
5851 +       spin_unlock(&lslist_lock);
5852 +
5853 +       error = kcl_join_service(local_id);
5854 +       if (error) {
5855 +               log_error(ls, "service manager join error %d", error);
5856 +               goto out_reg;
5857 +       }
5858 +
5859 +       /* The ls isn't actually running until it receives a start() from CMAN.
5860 +        * Nor does it have a global ls id until started. */
5861 +
5862 +
5863 +       /* Return the local ID as the lockspace handle. I've left this
5864 +          cast to a void* as it allows us to replace it with pretty much
5865 +          anything at a future date without breaking clients. But returning
5866 +          the address of the lockspace is a bad idea as it could get
5867 +          forcibly removed, leaving the client with a dangling pointer */
5868 +       *lockspace = (void *)local_id;
5869 +
5870 +       return 0;
5871 +
5872 +      out_reg:
5873 +       kcl_unregister_service(ls->ls_local_id);
5874 +
5875 +      out_idtblfree:
5876 +       free_lockidtbl(ls);
5877 +
5878 +      out_htfree:
5879 +       kfree(ls->ls_reshashtbl);
5880 +
5881 +      out_lsfree:
5882 +       free_ls(ls);
5883 +
5884 +      out:
5885 +       return error;
5886 +}
5887 +
5888 +/*
5889 + * Called by a system like GFS which wants independent lock spaces.
5890 + */
5891 +
5892 +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
5893 +{
5894 +       int error = -ENOSYS;
5895 +
5896 +       down(&gdlmstate_lock);
5897 +
5898 +       error = init_internal();
5899 +       if (error)
5900 +               goto out;
5901 +
5902 +       error = new_lockspace(name, namelen, lockspace, flags);
5903 +
5904 +      out:
5905 +       up(&gdlmstate_lock);
5906 +
5907 +       return error;
5908 +}
5909 +
5910 +/* Return 1 if the lockspace still has active remote locks,
5911 + *        2 if the lockspace still has active local locks.
5912 + */
5913 +static int lockspace_busy(gd_ls_t *ls)
5914 +{
5915 +    int i;
5916 +    int lkb_found = 0;
5917 +    gd_lkb_t *lkb;
5918 +
5919 +    /* NOTE: We check the lockidtbl here rather than the resource table.
5920 +     * This is because there may be LKBs queued as ASTs that have been unlinked
5921 +     * from their RSBs and are pending deletion once the AST has been delivered
5922 +     */
5923 +    read_lock(&ls->ls_lockidtbl_lock);
5924 +    for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5925 +       if (!list_empty(&ls->ls_lockidtbl[i].list)) {
5926 +           lkb_found = 1;
5927 +           list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) {
5928 +               if (!lkb->lkb_nodeid) {
5929 +                   read_unlock(&ls->ls_lockidtbl_lock);
5930 +                   return 2;
5931 +               }
5932 +           }
5933 +       }
5934 +    }
5935 +    read_unlock(&ls->ls_lockidtbl_lock);
5936 +    return lkb_found;
5937 +}
5938 +
5939 +/* Actually release the lockspace */
5940 +static int release_lockspace(gd_ls_t *ls, int force)
5941 +{
5942 +       gd_lkb_t *lkb;
5943 +       gd_res_t *rsb;
5944 +       gd_recover_t *gr;
5945 +       gd_csb_t *csb;
5946 +       struct list_head *head;
5947 +       int i;
5948 +       int busy = lockspace_busy(ls);
5949 +
5950 +       /* Don't destroy a busy lockspace */
5951 +       if (busy > force)
5952 +               return -EBUSY;
5953 +
5954 +       if (force < 3) {
5955 +               kcl_leave_service(ls->ls_local_id);
5956 +               kcl_unregister_service(ls->ls_local_id);
5957 +       }
5958 +
5959 +       spin_lock(&lslist_lock);
5960 +       list_del(&ls->ls_list);
5961 +       spin_unlock(&lslist_lock);
5962 +
5963 +       /*
5964 +        * Free resdata structs.
5965 +        */
5966 +
5967 +       resdir_clear(ls);
5968 +
5969 +       /*
5970 +        * Free all lkb's on lockidtbl[] lists.
5971 +        */
5972 +
5973 +       for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5974 +               head = &ls->ls_lockidtbl[i].list;
5975 +               while (!list_empty(head)) {
5976 +                       lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list);
5977 +                       list_del(&lkb->lkb_idtbl_list);
5978 +
5979 +                       if (lkb->lkb_lockqueue_state)
5980 +                               remove_from_lockqueue(lkb);
5981 +
5982 +                       if (lkb->lkb_asts_to_deliver)
5983 +                               list_del(&lkb->lkb_astqueue);
5984 +
5985 +                       if (lkb->lkb_lvbptr
5986 +                           && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
5987 +                               free_lvb(lkb->lkb_lvbptr);
5988 +
5989 +                       free_lkb(lkb);
5990 +               }
5991 +       }
5992 +
5993 +       /*
5994 +        * Free lkidtbl[] itself
5995 +        */
5996 +
5997 +       kfree(ls->ls_lockidtbl);
5998 +
5999 +       /*
6000 +        * Free all rsb's on reshashtbl[] lists
6001 +        */
6002 +
6003 +       for (i = 0; i < ls->ls_hashsize; i++) {
6004 +               head = &ls->ls_reshashtbl[i];
6005 +               while (!list_empty(head)) {
6006 +                       rsb = list_entry(head->next, gd_res_t, res_hashchain);
6007 +                       list_del(&rsb->res_hashchain);
6008 +
6009 +                       if (rsb->res_lvbptr)
6010 +                               free_lvb(rsb->res_lvbptr);
6011 +
6012 +                       free_rsb(rsb);
6013 +               }
6014 +       }
6015 +
6016 +       /*
6017 +        * Free reshashtbl[] itself
6018 +        */
6019 +
6020 +       kfree(ls->ls_reshashtbl);
6021 +
6022 +       /*
6023 +        * Free structures on any other lists
6024 +        */
6025 +
6026 +       head = &ls->ls_recover;
6027 +       while (!list_empty(head)) {
6028 +               gr = list_entry(head->next, gd_recover_t, gr_list);
6029 +               list_del(&gr->gr_list);
6030 +               free_dlm_recover(gr);
6031 +       }
6032 +
6033 +       head = &ls->ls_nodes;
6034 +       while (!list_empty(head)) {
6035 +               csb = list_entry(head->next, gd_csb_t, csb_list);
6036 +               list_del(&csb->csb_list);
6037 +               release_csb(csb);
6038 +       }
6039 +
6040 +       head = &ls->ls_nodes_gone;
6041 +       while (!list_empty(head)) {
6042 +               csb = list_entry(head->next, gd_csb_t, csb_list);
6043 +               list_del(&csb->csb_list);
6044 +               release_csb(csb);
6045 +       }
6046 +
6047 +       free_ls(ls);
6048 +
6049 +       dlm_release();
6050 +
6051 +       module_put(THIS_MODULE);
6052 +       return 0;
6053 +}
6054 +
6055 +
6056 +/*
6057 + * Called when a system has released all its locks and is not going to use the
6058 + * lockspace any longer.  We blindly free everything we're managing for this
6059 + * lockspace.  Remaining nodes will go through the recovery process as if we'd
6060 + * died.  The lockspace must continue to function as usual, participating in
6061 + * recoveries, until kcl_leave_service returns.
6062 + *
6063 + * Force has 4 possible values:
6064 + * 0 - don't destroy lockspace if it has any LKBs
6065 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6066 + * 2 - destroy lockspace regardless of LKBs
6067 + * 3 - destroy lockspace as part of a forced shutdown
6068 + */
6069 +
6070 +int dlm_release_lockspace(void *lockspace, int force)
6071 +{
6072 +       gd_ls_t *ls;
6073 +
6074 +       ls = find_lockspace_by_local_id(lockspace);
6075 +       if (!ls)
6076 +           return -EINVAL;
6077 +
6078 +       return release_lockspace(ls, force);
6079 +}
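+
+/* Usage sketch (editor's illustration; the caller shown is hypothetical):
+ * a clean shutdown would try the gentler force levels first:
+ *
+ *     error = dlm_release_lockspace(lockspace, 0);
+ *     if (error == -EBUSY)
+ *             error = dlm_release_lockspace(lockspace, 1);
+ *
+ * since lockspace_busy() makes force 0 fail on any remaining LKB and
+ * force 1 fail only on local LKBs. */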
6080 +
6081 +
6082 +/* Called when the cluster is being shut down dirtily */
6083 +void dlm_emergency_shutdown(void)
6084 +{
6085 +       gd_ls_t *ls;
6086 +       gd_ls_t *tmp;
6087 +
6088 +       /* Shut lowcomms down to prevent any socket activity */
6089 +       lowcomms_stop_accept();
6090 +
6091 +       /* Delete the devices that belong to the userland
6092 +          lockspaces about to be deleted. */
6093 +       dlm_device_free_devices();
6094 +
6095 +       /* Now try to clean the lockspaces */
6096 +       spin_lock(&lslist_lock);
6097 +
6098 +       list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6099 +               spin_unlock(&lslist_lock);
6100 +               release_lockspace(ls, 3);
6101 +               spin_lock(&lslist_lock);
6102 +       }
6103 +
6104 +       spin_unlock(&lslist_lock);
6105 +}
6106 +
6107 +gd_recover_t *allocate_dlm_recover(void)
6108 +{
6109 +       gd_recover_t *gr;
6110 +
6111 +       gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL);
6112 +       if (gr)
6113 +               memset(gr, 0, sizeof(gd_recover_t));
6114 +
6115 +       return gr;
6116 +}
6117 +
6118 +void free_dlm_recover(gd_recover_t * gr)
6119 +{
6120 +       kfree(gr);
6121 +}
6122 +
6123 +/*
6124 + * Called by CMAN on a specific ls.  "stop" means set a flag which, while
6125 + * set, causes all new requests to the ls to be queued and not submitted
6126 + * until the flag is cleared.  A stop must also cancel any prior starts.
6127 + * The recoverd thread carries out any work called for by this event.
6128 + */
6129 +
6130 +static int dlm_ls_stop(void *servicedata)
6131 +{
6132 +       gd_ls_t *ls = (gd_ls_t *) servicedata;
6133 +       int new;
6134 +
6135 +       spin_lock(&ls->ls_recover_lock);
6136 +       ls->ls_last_stop = ls->ls_last_start;
6137 +       set_bit(LSFL_LS_STOP, &ls->ls_flags);
6138 +       new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6139 +       spin_unlock(&ls->ls_recover_lock);
6140 +
6141 +       /*
6142 +        * This in_recovery lock does two things:
6143 +        *
6144 +        * 1) Keeps this function from returning until all threads are out
6145 +        *    of locking routines and locking is truly stopped.
6146 +        * 2) Keeps any new requests from being processed until it's unlocked
6147 +        *    when recovery is complete.
6148 +        */
6149 +
6150 +       if (new)
6151 +               down_write(&ls->ls_in_recovery);
6152 +
6153 +       clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6154 +       clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6155 +       clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6156 +       clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6157 +
6158 +       recoverd_kick(ls);
6159 +
6160 +       return 0;
6161 +}
6162 +
6163 +/*
6164 + * Called by CMAN on a specific ls.  "start" means enable the lockspace to do
6165 + * request processing which first requires that the recovery procedure be
6166 + * stepped through with all nodes sharing the lockspace (nodeids).  The first
6167 + * start on the ls after it's created is a special case and requires some extra
6168 + * work like figuring out our own local nodeid.  We can't do all this in the
6169 + * calling CMAN context, so we must pass this work off to the recoverd thread
6170 + * which was created in gdlm_init().  The recoverd thread carries out any work
6171 + * called for by this event.
6172 + */
6173 +
6174 +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6175 +                       int event_id, int type)
6176 +{
6177 +       gd_ls_t *ls = (gd_ls_t *) servicedata;
6178 +       gd_recover_t *gr;
6179 +       int error = -ENOMEM;
6180 +
6181 +       gr = allocate_dlm_recover();
6182 +       if (!gr)
6183 +               goto out;
6184 +
6185 +       gr->gr_nodeids = nodeids;
6186 +       gr->gr_node_count = count;
6187 +       gr->gr_event_id = event_id;
6188 +
6189 +       spin_lock(&ls->ls_recover_lock);
6190 +       ls->ls_last_start = event_id;
6191 +       list_add_tail(&gr->gr_list, &ls->ls_recover);
6192 +       set_bit(LSFL_LS_START, &ls->ls_flags);
6193 +       spin_unlock(&ls->ls_recover_lock);
6194 +
6195 +       recoverd_kick(ls);
6196 +       error = 0;
6197 +
6198 +      out:
6199 +       return error;
6200 +}
6201 +
6202 +/*
6203 + * Called by CMAN on a specific ls.  "finish" means that all nodes which
6204 + * received a "start" have completed the start and called kcl_start_done.
6205 + * The recoverd thread carries out any work called for by this event.
6206 + */
6207 +
6208 +static void dlm_ls_finish(void *servicedata, int event_id)
6209 +{
6210 +       gd_ls_t *ls = (gd_ls_t *) servicedata;
6211 +
6212 +       spin_lock(&ls->ls_recover_lock);
6213 +       ls->ls_last_finish = event_id;
6214 +       set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6215 +       spin_unlock(&ls->ls_recover_lock);
6216 +
6217 +       recoverd_kick(ls);
6218 +}
6219 +
6220 +struct kcl_service_ops ls_ops = {
6221 +       .stop = dlm_ls_stop,
6222 +       .start = dlm_ls_start,
6223 +       .finish = dlm_ls_finish
6224 +};
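+
+/* Editor's note on the callbacks above: CMAN delivers service events in
+ * the order stop -> start(nodeids, event_id) -> finish(event_id) on each
+ * membership change.  The callbacks only record state (LSFL_* bits, the
+ * ls_recover list) and kick recoverd, which performs the actual work. */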
6225 diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6226 --- linux-orig/cluster/dlm/lockspace.h  1970-01-01 07:30:00.000000000 +0730
6227 +++ linux-patched/cluster/dlm/lockspace.h       2004-06-25 18:31:07.000000000 +0800
6228 @@ -0,0 +1,29 @@
6229 +/******************************************************************************
6230 +*******************************************************************************
6231 +**
6232 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
6233 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
6234 +**  
6235 +**  This copyrighted material is made available to anyone wishing to use,
6236 +**  modify, copy, or redistribute it subject to the terms and conditions
6237 +**  of the GNU General Public License v.2.
6238 +**
6239 +*******************************************************************************
6240 +******************************************************************************/
6241 +
6242 +#ifndef __LOCKSPACE_DOT_H__
6243 +#define __LOCKSPACE_DOT_H__
6244 +
6245 +void dlm_lockspace_init(void);
6246 +int dlm_init(void);
6247 +int dlm_release(void);
6248 +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6249 +int dlm_release_lockspace(void *ls, int force);
6250 +gd_ls_t *find_lockspace_by_global_id(uint32_t id);
6251 +gd_ls_t *find_lockspace_by_local_id(void *id);
6252 +gd_ls_t *find_lockspace_by_name(char *name, int namelen);
6253 +void free_dlm_recover(gd_recover_t *gr);
6254 +int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out);
6255 +void dlm_emergency_shutdown(void);
6256 +
6257 +#endif                         /* __LOCKSPACE_DOT_H__ */
6258 diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6259 --- linux-orig/cluster/dlm/lowcomms.c   1970-01-01 07:30:00.000000000 +0730
6260 +++ linux-patched/cluster/dlm/lowcomms.c        2004-06-25 18:31:07.000000000 +0800
6261 @@ -0,0 +1,1354 @@
6262 +/******************************************************************************
6263 +*******************************************************************************
6264 +**
6265 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
6266 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
6267 +**
6268 +**  This copyrighted material is made available to anyone wishing to use,
6269 +**  modify, copy, or redistribute it subject to the terms and conditions
6270 +**  of the GNU General Public License v.2.
6271 +**
6272 +*******************************************************************************
6273 +******************************************************************************/
6274 +
6275 +/*
6276 + * lowcomms.c
6277 + *
6278 + * This is the "low-level" comms layer.
6279 + *
6280 + * It is responsible for sending/receiving messages
6281 + * from other nodes in the cluster.
6282 + *
6283 + * Cluster nodes are referred to by their nodeids.  To the locking
6284 + * module a nodeid is simply a 32 bit number; if it needs to be
6285 + * expanded for the cluster infrastructure then that is the
6286 + * infrastructure's responsibility.  It is this layer's
6287 + * responsibility to resolve a nodeid into an IP address or
6288 + * whatever else it needs for inter-node communication.
6289 + *
6290 + * The comms level is two kernel threads: a receive thread that
6291 + * deals mainly with receiving messages from other nodes and
6292 + * passing them up to the mid-level comms layer (which understands
6293 + * the message format) for execution by the locking core, and a
6294 + * send thread which does all the setting up of connections to
6295 + * remote nodes and the sending of data.  Other threads may not
6296 + * send their own data directly because that could make them wait
6297 + * in times of high load; it also lets the sending thread collect
6298 + * messages bound for one node and send them in one block.
6299 + *
6300 + * I don't see any problem with the recv thread executing the locking
6301 + * code on behalf of remote processes as the locking code is
6302 + * short, efficient and never waits.
6303 + *
6304 + */
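+
+/*
+ * Rough data flow (editor's summary of the code below):
+ *
+ *   sk_data_ready / sk_write_space callbacks
+ *      -> queue the connection on read_sockets / write_sockets
+ *      -> wake dlm_recvd / dlm_sendd
+ *
+ *   dlm_recvd: receive_from_sock() -> midcomms_process_incoming_buffer()
+ *   dlm_sendd: connect_to_sock(), then send_to_sock() drains writequeue
+ */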
6305 +
6306 +
6307 +#include <asm/ioctls.h>
6308 +#include <net/sock.h>
6309 +#include <net/tcp.h>
6310 +#include <linux/pagemap.h>
6311 +#include <cluster/cnxman.h>
6312 +
6313 +#include "dlm_internal.h"
6314 +#include "lowcomms.h"
6315 +#include "midcomms.h"
6316 +#include "config.h"
6317 +
6318 +struct cbuf {
6319 +       unsigned base;
6320 +       unsigned len;
6321 +       unsigned mask;
6322 +};
6323 +
6324 +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6325 +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6326 +#define CBUF_EMPTY(cb) ((cb)->len == 0)
6327 +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6328 +#define CBUF_EAT(cb, n) do { (cb)->len  -= (n); \
6329 +                             (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6330 +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
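+
+/* Example (editor's illustration): with a 4096-byte page, CBUF_INIT sets
+ * mask = 4095.  CBUF_ADD(cb, n) grows the valid region [base, base+len)
+ * by n bytes and CBUF_EAT(cb, n) consumes from the front, wrapping base
+ * through the mask, so the buffer never needs compacting; CBUF_DATA() is
+ * the offset where the next received bytes should land. */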
6331 +
6332 +struct connection {
6333 +       struct socket *sock;    /* NULL if not connected */
6334 +       uint32_t nodeid;        /* So we know who we are in the list */
6335 +       struct rw_semaphore sock_sem;   /* Stop connect races */
6336 +       struct list_head read_list;     /* On this list when ready for reading */
6337 +       struct list_head write_list;    /* On this list when ready for writing */
6338 +       struct list_head state_list;    /* On this list when ready to connect */
6339 +       unsigned long flags;    /* CF_ bits below: on read/write/connect lists */
6340 +#define CF_READ_PENDING 1
6341 +#define CF_WRITE_PENDING 2
6342 +#define CF_CONNECT_PENDING 3
6343 +#define CF_IS_OTHERSOCK 4
6344 +       struct list_head writequeue;    /* List of outgoing writequeue_entries */
6345 +       struct list_head listenlist;    /* List of allocated listening sockets */
6346 +       spinlock_t writequeue_lock;
6347 +       int (*rx_action) (struct connection *); /* What to do when active */
6348 +       struct page *rx_page;
6349 +       struct cbuf cb;
6350 +       int retries;
6351 +#define MAX_CONNECT_RETRIES 3
6352 +       struct connection *othersock;
6353 +};
6354 +#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6355 +#define nodeid2con(x) (&connections[(x)])
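+
+/* Note (editor's summary): connections[] is indexed directly by nodeid;
+ * nodeid2con(0) is the listening connection, and every peer nodeid is
+ * assumed to be below dlm_config.max_connections (this is asserted in
+ * lowcomms_send_message() below). */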
6356 +
6357 +/* An entry waiting to be sent */
6358 +struct writequeue_entry {
6359 +       struct list_head list;
6360 +       struct page *page;
6361 +       int offset;
6362 +       int len;
6363 +       int end;
6364 +       int users;
6365 +       struct connection *con;
6366 +};
6367 +
6368 +/* "Template" structure for IPv4 and IPv6 used to fill
6369 + * in the missing bits when converting between cman (which knows
6370 + * nothing about sockaddr structs) and real life where we actually
6371 + * have to connect to these addresses. Also one of these structs
6372 + * will hold the cached "us" address.
6373 + *
6374 + * It's an in6 sockaddr just so there's enough space for anything
6375 + * we're likely to see here.
6376 + */
6377 +static struct sockaddr_in6 local_addr;
6378 +
6379 +/* Manage daemons */
6380 +static struct semaphore thread_lock;
6381 +static struct completion thread_completion;
6382 +static atomic_t send_run;
6383 +static atomic_t recv_run;
6384 +
6385 +/* An array of connections, indexed by NODEID */
6386 +static struct connection *connections;
6387 +static int conn_array_size;
6388 +static atomic_t writequeue_length;
6389 +static atomic_t accepting;
6390 +
6391 +static wait_queue_t lowcomms_send_waitq_head;
6392 +static wait_queue_head_t lowcomms_send_waitq;
6393 +
6394 +static wait_queue_t lowcomms_recv_waitq_head;
6395 +static wait_queue_head_t lowcomms_recv_waitq;
6396 +
6397 +/* List of sockets that have reads pending */
6398 +static struct list_head read_sockets;
6399 +static spinlock_t read_sockets_lock;
6400 +
6401 +/* List of sockets which have writes pending */
6402 +static struct list_head write_sockets;
6403 +static spinlock_t write_sockets_lock;
6404 +
6405 +/* List of sockets which have connects pending */
6406 +static struct list_head state_sockets;
6407 +static spinlock_t state_sockets_lock;
6408 +
6409 +/* List of allocated listen sockets */
6410 +static struct list_head listen_sockets;
6411 +
6412 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6413 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6414 +
6415 +
6416 +/* Data available on socket or listen socket received a connect */
6417 +static void lowcomms_data_ready(struct sock *sk, int count_unused)
6418 +{
6419 +       struct connection *con = sock2con(sk);
6420 +
6421 +       if (test_and_set_bit(CF_READ_PENDING, &con->flags))
6422 +               return;
6423 +
6424 +       spin_lock_bh(&read_sockets_lock);
6425 +       list_add_tail(&con->read_list, &read_sockets);
6426 +       spin_unlock_bh(&read_sockets_lock);
6427 +
6428 +       wake_up_interruptible(&lowcomms_recv_waitq);
6429 +}
6430 +
6431 +static void lowcomms_write_space(struct sock *sk)
6432 +{
6433 +       struct connection *con = sock2con(sk);
6434 +
6435 +       if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
6436 +               return;
6437 +
6438 +       spin_lock_bh(&write_sockets_lock);
6439 +       list_add_tail(&con->write_list, &write_sockets);
6440 +       spin_unlock_bh(&write_sockets_lock);
6441 +
6442 +       wake_up_interruptible(&lowcomms_send_waitq);
6443 +}
6444 +
6445 +static inline void lowcomms_connect_sock(struct connection *con)
6446 +{
6447 +       if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
6448 +               return;
6449 +       if (!atomic_read(&accepting))
6450 +               return;
6451 +
6452 +       spin_lock_bh(&state_sockets_lock);
6453 +       list_add_tail(&con->state_list, &state_sockets);
6454 +       spin_unlock_bh(&state_sockets_lock);
6455 +
6456 +       wake_up_interruptible(&lowcomms_send_waitq);
6457 +}
6458 +
6459 +static void lowcomms_state_change(struct sock *sk)
6460 +{
6461 +/*     struct connection *con = sock2con(sk); */
6462 +
6463 +       switch (sk->sk_state) {
6464 +       case TCP_ESTABLISHED:
6465 +               lowcomms_write_space(sk);
6466 +               break;
6467 +
6468 +       case TCP_FIN_WAIT1:
6469 +       case TCP_FIN_WAIT2:
6470 +       case TCP_TIME_WAIT:
6471 +       case TCP_CLOSE:
6472 +       case TCP_CLOSE_WAIT:
6473 +       case TCP_LAST_ACK:
6474 +       case TCP_CLOSING:
6475 +               /* FIXME: I think this causes more trouble than it solves.
6476 +                  lowcomms will reconnect anyway when there is something to
6477 +                  send. This just attempts reconnection if a node goes down!
6478 +               */
6479 +               /* lowcomms_connect_sock(con); */
6480 +               break;
6481 +
6482 +       default:
6483 +               printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
6484 +               break;
6485 +       }
6486 +}
6487 +
6488 +/* Make a socket active */
6489 +static int add_sock(struct socket *sock, struct connection *con)
6490 +{
6491 +       con->sock = sock;
6492 +
6493 +       /* Install a data_ready callback */
6494 +       con->sock->sk->sk_data_ready = lowcomms_data_ready;
6495 +       con->sock->sk->sk_write_space = lowcomms_write_space;
6496 +       con->sock->sk->sk_state_change = lowcomms_state_change;
6497 +
6498 +       return 0;
6499 +}
6500 +
6501 +/* Add the port number to an IPv4 or IPv6 sockaddr and return the
6502 +   address length via *addr_len */
6503 +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
6504 +                         int *addr_len)
6505 +{
6506 +       saddr->sin6_family = local_addr.sin6_family;
6507 +       if (local_addr.sin6_family == AF_INET) {
6508 +               struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
6509 +               in4_addr->sin_port = cpu_to_be16(port);
6510 +               *addr_len = sizeof(struct sockaddr_in);
6511 +       }
6512 +       else {
6513 +               saddr->sin6_port = cpu_to_be16(port);
6514 +               *addr_len = sizeof(struct sockaddr_in6);
6515 +       }
6516 +}
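+
+/* For example (editor's illustration; 21064 is only a hypothetical value
+ * of dlm_config.tcp_port): on an AF_INET cluster this stores
+ * sin_port = htons(21064) in the in6 template and reports
+ * sizeof(struct sockaddr_in) as the usable address length. */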
6517 +
6518 +/* Close a remote connection and tidy up */
6519 +static void close_connection(struct connection *con)
6520 +{
6521 +       if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6522 +               return;
6523 +
6524 +       down_write(&con->sock_sem);
6525 +
6526 +       if (con->sock) {
6527 +               sock_release(con->sock);
6528 +               con->sock = NULL;
6529 +               if (con->othersock) {
6530 +                       down_write(&con->othersock->sock_sem);
6531 +                       sock_release(con->othersock->sock);
6532 +                       con->othersock->sock = NULL;
6533 +                       up_write(&con->othersock->sock_sem);
6534 +                       kfree(con->othersock);
6535 +                       con->othersock = NULL;
6536 +               }
6537 +       }
6538 +       if (con->rx_page) {
6539 +               __free_page(con->rx_page);
6540 +               con->rx_page = NULL;
6541 +       }
6542 +       up_write(&con->sock_sem);
6543 +}
6544 +
6545 +/* Data received from remote end */
6546 +static int receive_from_sock(struct connection *con)
6547 +{
6548 +       int ret = 0;
6549 +       struct msghdr msg;
6550 +       struct iovec iov[2];
6551 +       mm_segment_t fs;
6552 +       unsigned len;
6553 +       int r;
6554 +       int call_again_soon = 0;
6555 +
6556 +       down_read(&con->sock_sem);
6557 +
6558 +       if (con->sock == NULL)
6559 +               goto out;
6560 +       if (con->rx_page == NULL) {
6561 +               /*
6562 +                * This allocation doesn't strictly need to be atomic, but
6563 +                * GFP_ATOMIC avoids sleeping in the receive path.
6564 +                */
6565 +               con->rx_page = alloc_page(GFP_ATOMIC);
6566 +               if (con->rx_page == NULL)
6567 +                       goto out_resched;
6568 +               CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
6569 +       }
6570 +       /*
6571 +        * To avoid doing too many short reads, we will reschedule for
6572 +        * another time if there are fewer than 32 bytes left in the buffer.
6573 +        */
6574 +       if (!CBUF_MAY_ADD(&con->cb, 32))
6575 +               goto out_resched;
6576 +
6577 +       msg.msg_control = NULL;
6578 +       msg.msg_controllen = 0;
6579 +       msg.msg_iovlen = 1;
6580 +       msg.msg_iov = iov;
6581 +       msg.msg_name = NULL;
6582 +       msg.msg_namelen = 0;
6583 +       msg.msg_flags = 0;
6584 +
6585 +       /*
6586 +        * iov[0] is the bit of the circular buffer between the current end
6587 +        * point (cb.base + cb.len) and the end of the buffer.
6588 +        */
6589 +       iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
6590 +       iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
6591 +       iov[1].iov_len = 0;
6592 +
6593 +       /*
6594 +        * iov[1] is the bit of the circular buffer between the start of the
6595 +        * buffer and the start of the currently used section (cb.base)
6596 +        */
6597 +       if (CBUF_DATA(&con->cb) >= con->cb.base) {
6598 +               iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
6599 +               iov[1].iov_len = con->cb.base;
6600 +               iov[1].iov_base = page_address(con->rx_page);
6601 +               msg.msg_iovlen = 2;
6602 +       }
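+       /* Worked example (editor's illustration): with a 4096-byte page,
+        * base = 100 and len = 50, CBUF_DATA() is 150: iov[0] covers the
+        * free space at offsets 150..4095 and iov[1] the wrapped free
+        * space at offsets 0..99, while bytes 100..149 hold unread data. */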
6603 +       len = iov[0].iov_len + iov[1].iov_len;
6604 +
6605 +       fs = get_fs();
6606 +       set_fs(get_ds());
6607 +       r = ret = sock_recvmsg(con->sock, &msg, len,
6608 +                              MSG_DONTWAIT | MSG_NOSIGNAL);
6609 +       set_fs(fs);
6610 +
6611 +       if (ret <= 0)
6612 +               goto out_close;
6613 +       if (ret == len)
6614 +               call_again_soon = 1;
6615 +       CBUF_ADD(&con->cb, ret);
6616 +       ret = midcomms_process_incoming_buffer(con->nodeid,
6617 +                                              page_address(con->rx_page),
6618 +                                              con->cb.base, con->cb.len,
6619 +                                              PAGE_CACHE_SIZE);
6620 +       if (ret == -EBADMSG) {
6621 +               printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
6622 +                      "iov_len=%u, iov_base[0]=%p, read=%d\n",
6623 +                      page_address(con->rx_page), con->cb.base, con->cb.len,
6624 +                      len, iov[0].iov_base, r);
6625 +       }
6626 +       if (ret < 0)
6627 +               goto out_close;
6628 +       CBUF_EAT(&con->cb, ret);
6629 +
6630 +       if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
6631 +               __free_page(con->rx_page);
6632 +               con->rx_page = NULL;
6633 +       }
6634 +      out:
6635 +       if (call_again_soon)
6636 +               goto out_resched;
6637 +       up_read(&con->sock_sem);
6638 +       ret = 0;
6639 +       goto out_ret;
6640 +
6641 +      out_resched:
6642 +       lowcomms_data_ready(con->sock->sk, 0);
6643 +       up_read(&con->sock_sem);
6644 +       ret = 0;
6645 +       goto out_ret;
6646 +
6647 +      out_close:
6648 +       up_read(&con->sock_sem);
6649 +       if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6650 +               close_connection(con);
6651 +               lowcomms_connect_sock(con);
6652 +       }
6653 +
6654 +      out_ret:
6655 +       return ret;
6656 +}
6657 +
6658 +/* Listening socket is busy, accept a connection */
6659 +static int accept_from_sock(struct connection *con)
6660 +{
6661 +       int result;
6662 +       struct sockaddr_in6 peeraddr;
6663 +       struct socket *newsock;
6664 +       int len;
6665 +       int nodeid;
6666 +       struct connection *newcon;
6667 +
6668 +       memset(&peeraddr, 0, sizeof(peeraddr));
6669 +       newsock = sock_alloc();
6670 +       if (!newsock)
6671 +               return -ENOMEM;
6672 +
6673 +       down_read(&con->sock_sem);
6674 +
6675 +       result = -ENOTCONN;
6676 +       if (con->sock == NULL)
6677 +               goto accept_err;
6678 +
6679 +       newsock->type = con->sock->type;
6680 +       newsock->ops = con->sock->ops;
6681 +
6682 +       result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
6683 +       if (result < 0)
6684 +               goto accept_err;
6685 +
6686 +       /* Get the connected socket's peer */
6687 +       if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
6688 +                                 &len, 2)) {
6689 +               result = -ECONNABORTED;
6690 +               goto accept_err;
6691 +       }
6692 +
6693 +       /* Get the new node's NODEID */
6694 +       nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
6695 +       if (nodeid == 0) {
6696 +               printk("dlm: connect from non cluster node\n");
6697 +               sock_release(newsock);
6698 +               up_read(&con->sock_sem);
6699 +               return -1;
6700 +       }
6701 +
6702 +       log_print("got connection from %d", nodeid);
6703 +
6704 +       /*  Check to see if we already have a connection to this node. This
6705 +        *  could happen if the two nodes initiate a connection at roughly
6706 +        *  the same time and the connections cross on the wire.
6707 +        * TEMPORARY FIX:
6708 +        *  In this case we store the incoming one in "othersock"
6709 +        */
6710 +       newcon = nodeid2con(nodeid);
6711 +       down_write(&newcon->sock_sem);
6712 +       if (newcon->sock) {
6713 +               struct connection *othercon;
6714 +
6715 +               othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
6716 +               if (!othercon) {
6717 +                       printk("dlm: failed to allocate incoming socket\n");
6718 +                       sock_release(newsock);
6719 +                       up_write(&newcon->sock_sem);
6720 +                       up_read(&con->sock_sem);
6721 +                       goto accept_out;
6722 +               }
6723 +               memset(othercon, 0, sizeof(*othercon));
6724 +               newcon->othersock = othercon;
6725 +               othercon->nodeid = nodeid;
6726 +               othercon->sock = newsock;
6727 +               othercon->rx_action = receive_from_sock;
6728 +               add_sock(newsock, othercon);
6729 +               init_rwsem(&othercon->sock_sem);
6730 +               set_bit(CF_IS_OTHERSOCK, &othercon->flags);
6731 +               newsock->sk->sk_user_data = othercon;
6732 +
6733 +               up_write(&newcon->sock_sem);
6734 +               lowcomms_data_ready(newsock->sk, 0);
6735 +               up_read(&con->sock_sem);
6736 +               goto accept_out;
6737 +       }
6738 +
6739 +       newsock->sk->sk_user_data = newcon;
6740 +       newcon->rx_action = receive_from_sock;
6741 +       add_sock(newsock, newcon);
6742 +       up_write(&newcon->sock_sem);
6743 +
6744 +       /*
6745 +        * Add it to the active queue in case we got data
6746 +        * between processing the accept and adding the socket
6747 +        * to the read_sockets list
6748 +        */
6749 +       lowcomms_data_ready(newsock->sk, 0);
6750 +
6751 +       up_read(&con->sock_sem);
6752 +
6753 +      accept_out:
6754 +       return 0;
6755 +
6756 +      accept_err:
6757 +       up_read(&con->sock_sem);
6758 +       sock_release(newsock);
6759 +
6760 +       printk("dlm: error accepting connection from node: %d\n", result);
6761 +       return result;
6762 +}
6763 +
6764 +/* Connect a new socket to its peer */
6765 +static int connect_to_sock(struct connection *con)
6766 +{
6767 +       int result = -EHOSTUNREACH;
6768 +       struct sockaddr_in6 saddr;
6769 +       int addr_len;
6770 +       struct socket *sock;
6771 +
6772 +       if (con->nodeid == 0) {
6773 +               log_print("attempt to connect sock 0 foiled");
6774 +               return 0;
6775 +       }
6776 +
6777 +       down_write(&con->sock_sem);
6778 +       if (con->retries++ > MAX_CONNECT_RETRIES)
6779 +               goto out;
6780 +
6781 +       /* FIXME: not sure this should happen, let alone like this. */
6782 +       if (con->sock) {
6783 +               sock_release(con->sock);
6784 +               con->sock = NULL;
6785 +       }
6786 +
6787 +       /* Create a socket to communicate with */
6788 +       result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6789 +       if (result < 0)
6790 +               goto out_err;
6791 +
6792 +       if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
6793 +               goto out_err;
6794 +
6795 +       sock->sk->sk_user_data = con;
6796 +       con->rx_action = receive_from_sock;
6797 +
6798 +       make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
6799 +
6800 +       add_sock(sock, con);
6801 +       result =
6802 +           sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
6803 +                              O_NONBLOCK);
6804 +       if (result == -EINPROGRESS)
6805 +               result = 0;
6806 +       if (result != 0)
6807 +               goto out_err;
6808 +
6809 +      out:
6810 +       up_write(&con->sock_sem);
6811 +       /*
6812 +        * Returning an error here means we've given up trying to connect to
6813 +        * a remote node, otherwise we return 0 and reschedule the connection
6814 +        * attempt
6815 +        */
6816 +       return result;
6817 +
6818 +      out_err:
6819 +       if (con->sock) {
6820 +               sock_release(con->sock);
6821 +               con->sock = NULL;
6822 +       }
6823 +       /*
6824 +        * Some errors are fatal and this list might need adjusting. For other
6825 +        * errors we try again until the max number of retries is reached.
6826 +        */
6827 +       if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
6828 +           result != -ENETDOWN && result != -EINVAL &&
6829 +           result != -EPROTONOSUPPORT) {
6830 +               lowcomms_connect_sock(con);
6831 +               result = 0;
6832 +       }
6833 +       goto out;
6834 +}
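+
+/* Editor's note on the retry flow above: a nonblocking connect() that
+ * returns -EINPROGRESS is treated as success; completion is signalled
+ * later via lowcomms_state_change()/lowcomms_write_space() when the
+ * socket reaches TCP_ESTABLISHED, which requeues the connection for
+ * sending.  Hard failures are retried up to MAX_CONNECT_RETRIES. */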
6835 +
6836 +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
6837 +{
6838 +       struct socket *sock = NULL;
6839 +       mm_segment_t fs;
6840 +       int result = 0;
6841 +       int one = 1;
6842 +       struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
6843 +
6844 +       /* Create a socket to communicate with */
6845 +       result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6846 +       if (result < 0) {
6847 +               printk("dlm: Can't create listening comms socket\n");
6848 +               goto create_out;
6849 +       }
6850 +
6851 +       fs = get_fs();
6852 +       set_fs(get_ds());
6853 +       result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
6854 +       set_fs(fs);
6855 +       if (result < 0) {
6856 +               printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
6857 +       }
6858 +       sock->sk->sk_user_data = con;
6859 +       con->rx_action = accept_from_sock;
6860 +       con->sock = sock;
6861 +
6862 +       /* Bind to our port */
6863 +       make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
6864 +       result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
6865 +       if (result < 0) {
6866 +               printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
6867 +               sock_release(sock);
6868 +               sock = NULL;
6869 +               goto create_out;
6870 +       }
6871 +
6872 +       fs = get_fs();
6873 +       set_fs(get_ds());
6874 +
6875 +       result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
6876 +       set_fs(fs);
6877 +       if (result < 0) {
6878 +               printk("dlm: Set keepalive failed: %d\n", result);
6879 +       }
6880 +
6881 +       result = sock->ops->listen(sock, 5);
6882 +       if (result < 0) {
6883 +               printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
6884 +               sock_release(sock);
6885 +               sock = NULL;
6886 +               goto create_out;
6887 +       }
6888 +
6889 +      create_out:
6890 +       return sock;
6891 +}
6892 +
6893 +
6894 +/* Listen on all interfaces */
6895 +static int listen_for_all(void)
6896 +{
6897 +       int result = 0;
6898 +       int nodeid;
6899 +       struct socket *sock = NULL;
6900 +       struct list_head *addr_list;
6901 +       struct connection *con = nodeid2con(0);
6902 +       struct cluster_node_addr *node_addr;
6903 +       char local_addr[sizeof(struct sockaddr_in6)];
6904 +
6905 +       /* This will also fill in local_addr */
6906 +       nodeid = lowcomms_our_nodeid();
6907 +
6908 +       addr_list = kcl_get_node_addresses(nodeid);
6909 +       if (!addr_list) {
6910 +               printk("dlm: cannot initialise comms layer\n");
6911 +               result = -ENOTCONN;
6912 +               goto create_out;
6913 +       }
6914 +
6915 +       list_for_each_entry(node_addr, addr_list, list) {
6916 +
6917 +               if (!con) {
6918 +                       con = kmalloc(sizeof(struct connection), GFP_KERNEL);
6919 +                       if (!con) {
6920 +                               printk("dlm: failed to allocate listen socket\n");
6921 +                               result = -ENOMEM; goto create_out;
6922 +                       }
6923 +                       memset(con, 0, sizeof(*con));
6924 +                       init_rwsem(&con->sock_sem);
6925 +                       spin_lock_init(&con->writequeue_lock);
6926 +                       INIT_LIST_HEAD(&con->writequeue);
6927 +                       set_bit(CF_IS_OTHERSOCK, &con->flags);
6928 +               }
6929 +
6930 +               memcpy(local_addr, node_addr->addr, node_addr->addr_len);
6931 +               sock = create_listen_sock(con, local_addr,
6932 +                                         node_addr->addr_len);
6933 +               if (sock) {
6934 +                       add_sock(sock, con);
6935 +                       /* Keep a list of dynamically allocated listening
6936 +                          sockets so we can free them at shutdown */
6937 +                       if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6938 +                               list_add_tail(&con->listenlist, &listen_sockets);
6939 +               }
6940 +               else if (test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6941 +                       /* only free connections allocated above; the first
6942 +                          one is the static connections[0] entry */
6943 +                       kfree(con);
6944 +               }
6945 +               con = NULL;
6946 +       }
6947 +
6948 +      create_out:
6949 +       return result;
6950 +}
6951 +
6952 +
6953 +
6954 +static struct writequeue_entry *new_writequeue_entry(struct connection *con,
6955 +                                                    int allocation)
6956 +{
6957 +       struct writequeue_entry *entry;
6958 +
6959 +       entry = kmalloc(sizeof(struct writequeue_entry), allocation);
6960 +       if (!entry)
6961 +               return NULL;
6962 +
6963 +       entry->page = alloc_page(allocation);
6964 +       if (!entry->page) {
6965 +               kfree(entry);
6966 +               return NULL;
6967 +       }
6968 +
6969 +       entry->offset = 0;
6970 +       entry->len = 0;
6971 +       entry->end = 0;
6972 +       entry->users = 0;
6973 +       entry->con = con;
6974 +
6975 +       return entry;
6976 +}
6977 +
6978 +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
6979 +                                            int allocation, char **ppc)
6980 +{
6981 +       struct connection *con = nodeid2con(nodeid);
6982 +       struct writequeue_entry *e;
6983 +       int offset = 0;
6984 +       int users = 0;
6985 +
6986 +       if (!atomic_read(&accepting))
6987 +               return NULL;
6988 +
6989 +       spin_lock(&con->writequeue_lock);
6990 +       e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
6991 +       if (((struct list_head *) e == &con->writequeue) ||
6992 +           (PAGE_CACHE_SIZE - e->end < len)) {
6993 +               e = NULL;
6994 +       } else {
6995 +               offset = e->end;
6996 +               e->end += len;
6997 +               users = e->users++;
6998 +       }
6999 +       spin_unlock(&con->writequeue_lock);
7000 +
7001 +       if (e) {
7002 +             got_one:
7003 +               if (users == 0)
7004 +                       kmap(e->page);
7005 +               *ppc = page_address(e->page) + offset;
7006 +               return e;
7007 +       }
7008 +
7009 +       e = new_writequeue_entry(con, allocation);
7010 +       if (e) {
7011 +               spin_lock(&con->writequeue_lock);
7012 +               offset = e->end;
7013 +               e->end += len;
7014 +               users = e->users++;
7015 +               list_add_tail(&e->list, &con->writequeue);
7016 +               spin_unlock(&con->writequeue_lock);
7017 +               atomic_inc(&writequeue_length);
7018 +               goto got_one;
7019 +       }
7020 +       return NULL;
7021 +}
7022 +
7023 +void lowcomms_commit_buffer(struct writequeue_entry *e)
7024 +{
7025 +       struct connection *con = e->con;
7026 +       int users;
7027 +
7028 +       if (!atomic_read(&accepting))
7029 +               return;
7030 +
7031 +       spin_lock(&con->writequeue_lock);
7032 +       users = --e->users;
7033 +       if (users)
7034 +               goto out;
7035 +       e->len = e->end - e->offset;
7036 +       kunmap(e->page);
7037 +       spin_unlock(&con->writequeue_lock);
7038 +
7039 +       if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7040 +               spin_lock_bh(&write_sockets_lock);
7041 +               list_add_tail(&con->write_list, &write_sockets);
7042 +               spin_unlock_bh(&write_sockets_lock);
7043 +
7044 +               wake_up_interruptible(&lowcomms_send_waitq);
7045 +       }
7046 +       return;
7047 +
7048 +      out:
7049 +       spin_unlock(&con->writequeue_lock);
7050 +       return;
7051 +}
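+
+/* Sketch of the get/commit interface above (editor's illustration; the
+ * midcomms layer is the intended caller, and "msg"/"len" here are
+ * hypothetical):
+ *
+ *     e = lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &p);
+ *     if (!e)
+ *             return -ENOBUFS;
+ *     memcpy(p, msg, len);            (build the message in place)
+ *     lowcomms_commit_buffer(e);      (hand it to dlm_sendd)
+ *
+ * lowcomms_send_message() below is exactly this pattern. */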
7052 +
7053 +static void free_entry(struct writequeue_entry *e)
7054 +{
7055 +       __free_page(e->page);
7056 +       kfree(e);
7057 +       atomic_dec(&writequeue_length);
7058 +}
7059 +
7060 +/* Send a message */
7061 +static int send_to_sock(struct connection *con)
7062 +{
7063 +       int ret = 0;
7064 +       ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7065 +       const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7066 +       struct writequeue_entry *e;
7067 +       int len, offset;
7068 +
7069 +       down_read(&con->sock_sem);
7070 +       if (con->sock == NULL)
7071 +               goto out_connect;
7072 +
7073 +       sendpage = con->sock->ops->sendpage;
7074 +
7075 +       spin_lock(&con->writequeue_lock);
7076 +       for (;;) {
7077 +               e = list_entry(con->writequeue.next, struct writequeue_entry,
7078 +                              list);
7079 +               if ((struct list_head *) e == &con->writequeue)
7080 +                       break;
7081 +
7082 +               len = e->len;
7083 +               offset = e->offset;
7084 +               BUG_ON(len == 0 && e->users == 0);
7085 +               spin_unlock(&con->writequeue_lock);
7086 +
7087 +               ret = 0;
7088 +               if (len) {
7089 +                       ret = sendpage(con->sock, e->page, offset, len,
7090 +                                      msg_flags);
7091 +                       if (ret == -EAGAIN || ret == 0)
7092 +                               goto out;
7093 +                       if (ret <= 0)
7094 +                               goto send_error;
7095 +               }
7096 +
7097 +               spin_lock(&con->writequeue_lock);
7098 +               e->offset += ret;
7099 +               e->len -= ret;
7100 +
7101 +               if (e->len == 0 && e->users == 0) {
7102 +                       list_del(&e->list);
7103 +                       free_entry(e);
7104 +                       continue;
7105 +               }
7106 +       }
7107 +       spin_unlock(&con->writequeue_lock);
7108 +      out:
7109 +       up_read(&con->sock_sem);
7110 +       return ret;
7111 +
7112 +      send_error:
7113 +       up_read(&con->sock_sem);
7114 +       close_connection(con);
7115 +       lowcomms_connect_sock(con);
7116 +       return ret;
7117 +
7118 +      out_connect:
7119 +       up_read(&con->sock_sem);
7120 +       lowcomms_connect_sock(con);
7121 +       return 0;
7122 +}
7123 +
7124 +/* Called from recoverd when it knows that a node has
7125 +   left the cluster */
7126 +int lowcomms_close(int nodeid)
7127 +{
7128 +       struct connection *con;
7129 +
7130 +       if (!connections)
7131 +               goto out;
7132 +
7133 +       con = nodeid2con(nodeid);
7134 +       if (con->sock) {
7135 +               close_connection(con);
7136 +               return 0;
7137 +       }
7138 +
7139 +      out:
7140 +       return -1;
7141 +}
7142 +
7143 +/* API send message call, may queue the request */
7144 +/* N.B. This is the old interface - use the new one for new calls */
7145 +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7146 +{
7147 +       struct writequeue_entry *e;
7148 +       char *b;
7149 +
7150 +       GDLM_ASSERT(nodeid < dlm_config.max_connections,
7151 +                   printk("nodeid=%u\n", nodeid););
7152 +
7153 +       e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7154 +       if (e) {
7155 +               memcpy(b, buf, len);
7156 +               lowcomms_commit_buffer(e);
7157 +               return 0;
7158 +       }
7159 +       return -ENOBUFS;
7160 +}
7161 +
7162 +/* Look for activity on active sockets */
7163 +static void process_sockets(void)
7164 +{
7165 +       struct list_head *list;
7166 +       struct list_head *temp;
7167 +
7168 +       spin_lock_bh(&read_sockets_lock);
7169 +       list_for_each_safe(list, temp, &read_sockets) {
7170 +               struct connection *con =
7171 +                   list_entry(list, struct connection, read_list);
7172 +               list_del(&con->read_list);
7173 +               clear_bit(CF_READ_PENDING, &con->flags);
7174 +
7175 +               spin_unlock_bh(&read_sockets_lock);
7176 +
7177 +               con->rx_action(con);
7178 +
7179 +               /* Don't starve out everyone else */
7180 +               schedule();
7181 +               spin_lock_bh(&read_sockets_lock);
7182 +       }
7183 +       spin_unlock_bh(&read_sockets_lock);
7184 +}
7185 +
7186 +/* Try to send any messages that are pending
7187 + */
7188 +static void process_output_queue(void)
7189 +{
7190 +       struct list_head *list;
7191 +       struct list_head *temp;
7192 +       int ret;
7193 +
7194 +       spin_lock_bh(&write_sockets_lock);
7195 +       list_for_each_safe(list, temp, &write_sockets) {
7196 +               struct connection *con =
7197 +                   list_entry(list, struct connection, write_list);
7198 +               list_del(&con->write_list);
7199 +               clear_bit(CF_WRITE_PENDING, &con->flags);
7200 +
7201 +               spin_unlock_bh(&write_sockets_lock);
7202 +
7203 +               /* errors are handled inside send_to_sock(), which closes
7204 +                  the connection and schedules a reconnect */
7205 +               ret = send_to_sock(con);
7206 +               spin_lock_bh(&write_sockets_lock);
7207 +       }
7208 +       spin_unlock_bh(&write_sockets_lock);
7209 +}
7210 +
7211 +static void process_state_queue(void)
7212 +{
7213 +       struct list_head *list;
7214 +       struct list_head *temp;
7215 +       int ret;
7216 +
7217 +       spin_lock_bh(&state_sockets_lock);
7218 +       list_for_each_safe(list, temp, &state_sockets) {
7219 +               struct connection *con =
7220 +                   list_entry(list, struct connection, state_list);
7221 +               list_del(&con->state_list);
7222 +               clear_bit(CF_CONNECT_PENDING, &con->flags);
7223 +               spin_unlock_bh(&state_sockets_lock);
7224 +
7225 +               /* errors are handled inside connect_to_sock(), which
7226 +                  retries until MAX_CONNECT_RETRIES is exceeded */
7227 +               ret = connect_to_sock(con);
7228 +               spin_lock_bh(&state_sockets_lock);
7229 +       }
7230 +       spin_unlock_bh(&state_sockets_lock);
7231 +}
7232 +
7233 +/* Discard all entries on the write queues */
7234 +static void clean_writequeues(void)
7235 +{
7236 +       struct list_head *list;
7237 +       struct list_head *temp;
7238 +       int nodeid;
7239 +
7240 +       for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
7241 +               struct connection *con = nodeid2con(nodeid);
7242 +
7243 +               spin_lock(&con->writequeue_lock);
7244 +               list_for_each_safe(list, temp, &con->writequeue) {
7245 +                       struct writequeue_entry *e =
7246 +                           list_entry(list, struct writequeue_entry, list);
7247 +                       list_del(&e->list);
7248 +                       free_entry(e);
7249 +               }
7250 +               spin_unlock(&con->writequeue_lock);
7251 +       }
7252 +}
7253 +
7254 +static int read_list_empty(void)
7255 +{
7256 +       int status;
7257 +
7258 +       spin_lock_bh(&read_sockets_lock);
7259 +       status = list_empty(&read_sockets);
7260 +       spin_unlock_bh(&read_sockets_lock);
7261 +
7262 +       return status;
7263 +}
7264 +
7265 +/* DLM Transport comms receive daemon */
7266 +static int dlm_recvd(void *data)
7267 +{
7268 +       daemonize("dlm_recvd");
7269 +       atomic_set(&recv_run, 1);
7270 +
7271 +       init_waitqueue_head(&lowcomms_recv_waitq);
7272 +       init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7273 +       add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7274 +
7275 +       complete(&thread_completion);
7276 +
7277 +       while (atomic_read(&recv_run)) {
7278 +
7279 +               set_task_state(current, TASK_INTERRUPTIBLE);
7280 +
7281 +               if (read_list_empty())
7282 +                       schedule();
7283 +
7284 +               set_task_state(current, TASK_RUNNING);
7285 +
7286 +               process_sockets();
7287 +       }
7288 +
7289 +       down(&thread_lock);
7290 +       up(&thread_lock);
7291 +
7292 +       complete(&thread_completion);
7293 +
7294 +       return 0;
7295 +}
7296 +
7297 +static int write_and_state_lists_empty(void)
7298 +{
7299 +       int status;
7300 +
7301 +       spin_lock_bh(&write_sockets_lock);
7302 +       status = list_empty(&write_sockets);
7303 +       spin_unlock_bh(&write_sockets_lock);
7304 +
7305 +       spin_lock_bh(&state_sockets_lock);
7306 +       if (list_empty(&state_sockets) == 0)
7307 +               status = 0;
7308 +       spin_unlock_bh(&state_sockets_lock);
7309 +
7310 +       return status;
7311 +}
7312 +
7313 +/* DLM Transport send daemon */
7314 +static int dlm_sendd(void *data)
7315 +{
7316 +       daemonize("dlm_sendd");
7317 +       atomic_set(&send_run, 1);
7318 +
7319 +       init_waitqueue_head(&lowcomms_send_waitq);
7320 +       init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7321 +       add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7322 +
7323 +       complete(&thread_completion);
7324 +
7325 +       while (atomic_read(&send_run)) {
7326 +
7327 +               set_task_state(current, TASK_INTERRUPTIBLE);
7328 +
7329 +               if (write_and_state_lists_empty())
7330 +                       schedule();
7331 +
7332 +               set_task_state(current, TASK_RUNNING);
7333 +
7334 +               process_state_queue();
7335 +               process_output_queue();
7336 +       }
7337 +
7338 +       down(&thread_lock);
7339 +       up(&thread_lock);
7340 +
7341 +       complete(&thread_completion);
7342 +
7343 +       return 0;
7344 +}
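+
+/* Editor's note: the bare down()/up() pair on thread_lock above is a
+ * shutdown handshake -- daemons_stop() holds thread_lock while clearing
+ * {recv,send}_run and waking the thread, so the exiting thread blocks
+ * here until the stop request has been fully issued before it signals
+ * thread_completion. */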
7345 +
7346 +static void daemons_stop(void)
7347 +{
7348 +       if (atomic_read(&recv_run)) {
7349 +               down(&thread_lock);
7350 +               atomic_set(&recv_run, 0);
7351 +               wake_up_interruptible(&lowcomms_recv_waitq);
7352 +               up(&thread_lock);
7353 +               wait_for_completion(&thread_completion);
7354 +       }
7355 +
7356 +       if (atomic_read(&send_run)) {
7357 +               down(&thread_lock);
7358 +               atomic_set(&send_run, 0);
7359 +               wake_up_interruptible(&lowcomms_send_waitq);
7360 +               up(&thread_lock);
7361 +               wait_for_completion(&thread_completion);
7362 +       }
7363 +}
7364 +
7365 +static int daemons_start(void)
7366 +{
7367 +       int error;
7368 +
7369 +       error = kernel_thread(dlm_recvd, NULL, 0);
7370 +       if (error < 0) {
7371 +               log_print("can't start recvd thread: %d", error);
7372 +               goto out;
7373 +       }
7374 +       wait_for_completion(&thread_completion);
7375 +
7376 +       error = kernel_thread(dlm_sendd, NULL, 0);
7377 +       if (error < 0) {
7378 +               log_print("can't start sendd thread: %d", error);
7379 +               daemons_stop();
7380 +               goto out;
7381 +       }
7382 +       wait_for_completion(&thread_completion);
7383 +
7384 +       error = 0;
7385 + out:
7386 +       return error;
7387 +}
7388 +
7389 +/*
7390 + * Return the largest buffer size we can cope with.
7391 + */
7392 +int lowcomms_max_buffer_size(void)
7393 +{
7394 +       return PAGE_CACHE_SIZE;
7395 +}
7396 +
7397 +void lowcomms_stop(void)
7398 +{
7399 +       int i;
7400 +       struct connection *temp;
7401 +       struct connection *lcon;
7402 +
7403 +       atomic_set(&accepting, 0);
7404 +
7405 +       /* Set all the activity flags to prevent any
7406 +          socket activity.
7407 +       */
7408 +       for (i = 0; i < conn_array_size; i++) {
7409 +               connections[i].flags = 0x7;
7410 +       }
7411 +       daemons_stop();
7412 +       clean_writequeues();
7413 +
7414 +       for (i = 0; i < conn_array_size; i++) {
7415 +               close_connection(nodeid2con(i));
7416 +       }
7417 +
7418 +       kfree(connections);
7419 +       connections = NULL;
7420 +
7421 +       /* Free up any dynamically allocated listening sockets */
7422 +       list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
7423 +               sock_release(lcon->sock);
7424 +               kfree(lcon);
7425 +       }
7426 +
7427 +       kcl_releaseref_cluster();
7428 +}
7429 +
7430 +/* This is quite likely to sleep... */
7431 +int lowcomms_start(void)
7432 +{
7433 +       int error = 0;
7434 +       int i;
7435 +
7436 +       INIT_LIST_HEAD(&read_sockets);
7437 +       INIT_LIST_HEAD(&write_sockets);
7438 +       INIT_LIST_HEAD(&state_sockets);
7439 +       INIT_LIST_HEAD(&listen_sockets);
7440 +
7441 +       spin_lock_init(&read_sockets_lock);
7442 +       spin_lock_init(&write_sockets_lock);
7443 +       spin_lock_init(&state_sockets_lock);
7444 +
7445 +       init_completion(&thread_completion);
7446 +       init_MUTEX(&thread_lock);
7447 +       atomic_set(&send_run, 0);
7448 +       atomic_set(&recv_run, 0);
7449 +
7450 +       error = -ENOTCONN;
7451 +       if (kcl_addref_cluster())
7452 +               goto out;
7453 +
7454 +       /*
7455 +        * Temporarily initialise the waitq head so that lowcomms_send_message
7456 +        * doesn't crash if it gets called before the thread is fully
7457 +        * initialised
7458 +        */
7459 +       init_waitqueue_head(&lowcomms_send_waitq);
7460 +
7461 +       error = -ENOMEM;
7462 +
7463 +       connections = kmalloc(sizeof(struct connection) *
7464 +                             dlm_config.max_connections, GFP_KERNEL);
7465 +       if (!connections)
7466 +               goto out;
7467 +
7468 +       memset(connections, 0,
7469 +              sizeof(struct connection) * dlm_config.max_connections);
7470 +       for (i = 0; i < dlm_config.max_connections; i++) {
7471 +               connections[i].nodeid = i;
7472 +               init_rwsem(&connections[i].sock_sem);
7473 +               INIT_LIST_HEAD(&connections[i].writequeue);
7474 +               spin_lock_init(&connections[i].writequeue_lock);
7475 +       }
7476 +       conn_array_size = dlm_config.max_connections;
7477 +
7478 +       /* Start listening */
7479 +       error = listen_for_all();
7480 +       if (error)
7481 +               goto fail_free_conn;
7482 +
7483 +       error = daemons_start();
7484 +       if (error)
7485 +               goto fail_free_conn;
7486 +
7487 +       atomic_set(&accepting, 1);
7488 +
7489 +       return 0;
7490 +
7491 +      fail_free_conn:
7492 +       kfree(connections);
7493 +
7494 +      out:
7495 +       return error;
7496 +}
7497 +
7498 +/* Don't accept any more outgoing work */
7499 +void lowcomms_stop_accept(void)
7500 +{
7501 +       atomic_set(&accepting, 0);
7502 +}
7503 +
7504 +/* Cluster Manager interface functions for looking up
7505 +   nodeids and IP addresses by each other
7506 +*/
7507 +
7508 +/* Return the IP address of a node given its NODEID */
7509 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
7510 +{
7511 +       struct list_head *addrs;
7512 +       struct cluster_node_addr *node_addr;
7513 +       struct cluster_node_addr *current_addr = NULL;
7514 +       struct sockaddr_in6 *saddr;
7515 +       int interface;
7516 +       int i;
7517 +
7518 +       addrs = kcl_get_node_addresses(nodeid);
7519 +       if (!addrs)
7520 +               return -1;
7521 +
7522 +       interface = kcl_get_current_interface();
7523 +
7524 +       /* Look for address number <interface> */
7525 +       i=0; /* i/f numbers start at 1 */
7526 +       list_for_each_entry(node_addr, addrs, list) {
7527 +               if (interface == ++i) {
7528 +                       current_addr = node_addr;
7529 +                       break;
7530 +               }
7531 +       }
7532 +
7533 +       /* If that failed then just use the first one */
7534 +       if (!current_addr)
7535 +               current_addr = (struct cluster_node_addr *)addrs->next;
7536 +
7537 +       saddr = (struct sockaddr_in6 *)current_addr->addr;
7538 +
7539 +       /* Extract the IP address */
7540 +       if (saddr->sin6_family == AF_INET) {
7541 +               struct sockaddr_in *in4  = (struct sockaddr_in *)saddr;
7542 +               struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
7543 +               ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
7544 +       }
7545 +       else {
7546 +               struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
7547 +               memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
7548 +       }
7549 +
7550 +       return 0;
7551 +}
7552 +
7553 +/* Return the NODEID for a node given its sockaddr */
7554 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
7555 +{
7556 +       struct kcl_cluster_node node;
7557 +       struct sockaddr_in6 ipv6_addr;
7558 +       struct sockaddr_in  ipv4_addr;
7559 +
7560 +       if (addr->sa_family == AF_INET) {
7561 +               struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
7562 +               memcpy(&ipv4_addr, &local_addr, addr_len);
7563 +               memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
7564 +
7565 +               addr = (struct sockaddr *)&ipv4_addr;
7566 +       }
7567 +       else {
7568 +               struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
7569 +               memcpy(&ipv6_addr, &local_addr, addr_len);
7570 +               memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
7571 +
7572 +               addr = (struct sockaddr *)&ipv6_addr;
7573 +       }
7574 +
7575 +       if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
7576 +               return node.node_id;
7577 +       else
7578 +               return 0;
7579 +}
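+
+/* How the lookup above works: the caller only hands us the peer's IP
+ * address, but cman stores complete sockaddrs.  So we start from a copy
+ * of our own local_addr, overwrite just the sin_addr/sin6_addr part
+ * with the peer's IP, and look that up -- presumably so the non-address
+ * fields (family, port) match what cman has on record.
+ */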
7580 +
7581 +int lowcomms_our_nodeid(void)
7582 +{
7583 +       struct kcl_cluster_node node;
7584 +       struct list_head *addrs;
7585 +       struct cluster_node_addr *first_addr;
7586 +       static int our_nodeid = 0;
7587 +
7588 +       if (our_nodeid)
7589 +               return our_nodeid;
7590 +
7591 +       if (kcl_get_node_by_nodeid(0, &node) == -1)
7592 +               return 0;
7593 +
7594 +       our_nodeid = node.node_id;
7595 +
7596 +       /* Fill in the "template" structure */
7597 +       addrs = kcl_get_node_addresses(our_nodeid);
7598 +       if (!addrs)
7599 +               return 0;
7600 +
7601 +       first_addr = (struct cluster_node_addr *) addrs->next;
7602 +       memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
7603 +
7604 +       return node.node_id;
7605 +}
7606 +/*
7607 + * Overrides for Emacs so that we follow Linus's tabbing style.
7608 + * Emacs will notice this stuff at the end of the file and automatically
7609 + * adjust the settings for this buffer only.  This must remain at the end
7610 + * of the file.
7611 + * ---------------------------------------------------------------------------
7612 + * Local variables:
7613 + * c-file-style: "linux"
7614 + * End:
7615 + */
7616 diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
7617 --- linux-orig/cluster/dlm/lowcomms.h   1970-01-01 07:30:00.000000000 +0730
7618 +++ linux-patched/cluster/dlm/lowcomms.h        2004-06-25 18:31:07.000000000 +0800
7619 @@ -0,0 +1,34 @@
7620 +/******************************************************************************
7621 +*******************************************************************************
7622 +**
7623 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
7624 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
7625 +**  
7626 +**  This copyrighted material is made available to anyone wishing to use,
7627 +**  modify, copy, or redistribute it subject to the terms and conditions
7628 +**  of the GNU General Public License v.2.
7629 +**
7630 +*******************************************************************************
7631 +******************************************************************************/
7632 +
7633 +#ifndef __LOWCOMMS_DOT_H__
7634 +#define __LOWCOMMS_DOT_H__
7635 +
7636 +/* The old interface */
7637 +int lowcomms_send_message(int csid, char *buf, int len, int allocation);
7638 +
7639 +/* The new interface */
7640 +struct writequeue_entry;
7641 +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7642 +                                                   int allocation, char **ppc);
7643 +extern void lowcomms_commit_buffer(struct writequeue_entry *e);
7644 +
7645 +int lowcomms_start(void);
7646 +void lowcomms_stop(void);
7647 +void lowcomms_stop_accept(void);
7648 +int lowcomms_close(int nodeid);
7649 +int lowcomms_max_buffer_size(void);
7650 +
7651 +int lowcomms_our_nodeid(void);
7652 +
7653 +#endif                         /* __LOWCOMMS_DOT_H__ */
7654 diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
7655 --- linux-orig/cluster/dlm/main.c       1970-01-01 07:30:00.000000000 +0730
7656 +++ linux-patched/cluster/dlm/main.c    2004-06-25 18:31:07.000000000 +0800
7657 @@ -0,0 +1,98 @@
7658 +/******************************************************************************
7659 +*******************************************************************************
7660 +**
7661 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
7662 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
7663 +**  
7664 +**  This copyrighted material is made available to anyone wishing to use,
7665 +**  modify, copy, or redistribute it subject to the terms and conditions
7666 +**  of the GNU General Public License v.2.
7667 +**
7668 +*******************************************************************************
7669 +******************************************************************************/
7670 +
7671 +#define EXPORT_SYMTAB
7672 +
7673 +#include <linux/init.h>
7674 +#include <linux/proc_fs.h>
7675 +#include <linux/ctype.h>
7676 +#include <linux/seq_file.h>
7677 +#include <linux/module.h>
7678 +#include <net/sock.h>
7679 +
7680 +#include <cluster/cnxman.h>
7681 +
7682 +#include "dlm_internal.h"
7683 +#include "lockspace.h"
7684 +#include "recoverd.h"
7685 +#include "ast.h"
7686 +#include "lkb.h"
7687 +#include "nodes.h"
7688 +#include "locking.h"
7689 +#include "config.h"
7690 +#include "memory.h"
7691 +#include "recover.h"
7692 +#include "lowcomms.h"
7693 +
7694 +int  dlm_device_init(void);
7695 +void dlm_device_exit(void);
7696 +void dlm_proc_init(void);
7697 +void dlm_proc_exit(void);
7698 +
7699 +
7700 +/* Cluster manager callbacks: we want to know if a node dies.
7701 +   N.B. this is independent of lockspace-specific event callbacks from SM */
7702 +
7703 +static void cman_callback(kcl_callback_reason reason, long arg)
7704 +{
7705 +       if (reason == DIED) {
7706 +               lowcomms_close((int) arg);
7707 +       }
7708 +
7709 +       /* This is unconditional, so do what we can to tidy up */
7710 +       if (reason == LEAVING) {
7711 +               dlm_emergency_shutdown();
7712 +       }
7713 +}
7714 +
7715 +int __init init_dlm(void)
7716 +{
7717 +       dlm_proc_init();
7718 +       dlm_lockspace_init();
7719 +       dlm_recoverd_init();
7720 +       dlm_nodes_init();
7721 +       dlm_device_init();
7722 +       dlm_memory_init();
7723 +       dlm_config_init();
7724 +
7725 +       kcl_add_callback(cman_callback);
7726 +
7727 +       printk("DLM %s (built %s %s) installed\n",
7728 +              DLM_RELEASE_NAME, __DATE__, __TIME__);
7729 +
7730 +       return 0;
7731 +}
7732 +
7733 +void __exit exit_dlm(void)
7734 +{
7735 +       kcl_remove_callback(cman_callback);
7736 +
7737 +       dlm_device_exit();
7738 +       dlm_memory_exit();
7739 +       dlm_config_exit();
7740 +       dlm_proc_exit();
7741 +}
7742 +
7743 +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
7744 +MODULE_AUTHOR("Red Hat, Inc.");
7745 +MODULE_LICENSE("GPL");
7746 +
7747 +module_init(init_dlm);
7748 +module_exit(exit_dlm);
7749 +
7750 +EXPORT_SYMBOL(dlm_init);
7751 +EXPORT_SYMBOL(dlm_release);
7752 +EXPORT_SYMBOL(dlm_new_lockspace);
7753 +EXPORT_SYMBOL(dlm_release_lockspace);
7754 +EXPORT_SYMBOL(dlm_lock);
7755 +EXPORT_SYMBOL(dlm_unlock);
7756 diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
7757 --- linux-orig/cluster/dlm/memory.c     1970-01-01 07:30:00.000000000 +0730
7758 +++ linux-patched/cluster/dlm/memory.c  2004-06-25 18:31:07.000000000 +0800
7759 @@ -0,0 +1,238 @@
7760 +/******************************************************************************
7761 +*******************************************************************************
7762 +**
7763 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
7764 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
7765 +**  
7766 +**  This copyrighted material is made available to anyone wishing to use,
7767 +**  modify, copy, or redistribute it subject to the terms and conditions
7768 +**  of the GNU General Public License v.2.
7769 +**
7770 +*******************************************************************************
7771 +******************************************************************************/
7772 +
7773 +/* memory.c
7774 + * 
7775 + * memory allocation routines
7776 + * 
7777 + */
7778 +
7779 +#include "dlm_internal.h"
7780 +#include "memory.h"
7781 +#include "config.h"
7782 +
7783 +/* as the man says...Shouldn't this be in a header file somewhere? */
7784 +#define        BYTES_PER_WORD          sizeof(void *)
7785 +
7786 +static kmem_cache_t *rsb_cache_small;
7787 +static kmem_cache_t *rsb_cache_large;
7788 +static kmem_cache_t *lkb_cache;
7789 +static kmem_cache_t *lvb_cache;
7790 +static kmem_cache_t *resdir_cache_large;
7791 +static kmem_cache_t *resdir_cache_small;
7792 +
7793 +/* The thresholds above which we allocate large RSBs/resdatas rather than small 
7794 + * ones. This must make the resultant structure end on a word boundary */
7795 +#define LARGE_RSB_NAME 28
7796 +#define LARGE_RES_NAME 28
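+
+/* Rounding example for the cache sizes below: if sizeof(gd_res_t) +
+ * LARGE_RSB_NAME came to, say, 92 bytes on a 64-bit machine
+ * (illustrative figure only), then
+ * (92 + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1) == 96, so every object
+ * in the small cache ends on a word boundary as required.
+ */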
7797 +
7798 +int dlm_memory_init()
7799 +{
7800 +       int ret = -ENOMEM;
7801 +
7802 +
7803 +       rsb_cache_small =
7804 +           kmem_cache_create("dlm_rsb(small)",
7805 +                             (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7806 +                             __alignof__(gd_res_t), 0, NULL, NULL);
7807 +       if (!rsb_cache_small)
7808 +               goto out;
7809 +
7810 +       rsb_cache_large =
7811 +           kmem_cache_create("dlm_rsb(large)",
7812 +                             sizeof(gd_res_t) + DLM_RESNAME_MAXLEN,
7813 +                             __alignof__(gd_res_t), 0, NULL, NULL);
7814 +       if (!rsb_cache_large)
7815 +               goto out_free_rsbs;
7816 +
7817 +       lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t),
7818 +                                     __alignof__(gd_lkb_t), 0, NULL, NULL);
7819 +       if (!lkb_cache)
7820 +               goto out_free_rsbl;
7821 +
7822 +       resdir_cache_large =
7823 +           kmem_cache_create("dlm_resdir(l)",
7824 +                             sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN,
7825 +                             __alignof__(gd_resdata_t), 0, NULL, NULL);
7826 +       if (!resdir_cache_large)
7827 +               goto out_free_lkb;
7828 +
7829 +       resdir_cache_small =
7830 +           kmem_cache_create("dlm_resdir(s)",
7831 +                             (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7832 +                             __alignof__(gd_resdata_t), 0, NULL, NULL);
7833 +       if (!resdir_cache_small)
7834 +               goto out_free_resl;
7835 +
7836 +       /* LVB cache also holds ranges, so should be 64bit aligned */
7837 +       lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
7838 +                                     __alignof__(uint64_t), 0, NULL, NULL);
7839 +       if (!lvb_cache)
7840 +               goto out_free_ress;
7841 +
7842 +       ret = 0;
7843 +       goto out;
7844 +
7845 +      out_free_ress:
7846 +       kmem_cache_destroy(resdir_cache_small);
7847 +
7848 +      out_free_resl:
7849 +       kmem_cache_destroy(resdir_cache_large);
7850 +
7851 +      out_free_lkb:
7852 +       kmem_cache_destroy(lkb_cache);
7853 +
7854 +      out_free_rsbl:
7855 +       kmem_cache_destroy(rsb_cache_large);
7856 +
7857 +      out_free_rsbs:
7858 +       kmem_cache_destroy(rsb_cache_small);
7859 +
7860 +      out:
7861 +       return ret;
7862 +}
7863 +
7864 +void dlm_memory_exit()
7865 +{
7866 +       kmem_cache_destroy(rsb_cache_large);
7867 +       kmem_cache_destroy(rsb_cache_small);
7868 +       kmem_cache_destroy(lkb_cache);
7869 +       kmem_cache_destroy(resdir_cache_small);
7870 +       kmem_cache_destroy(resdir_cache_large);
7871 +       kmem_cache_destroy(lvb_cache);
7872 +}
7873 +
7874 +gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen)
7875 +{
7876 +       gd_res_t *r;
7877 +
7878 +       GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7879 +
7880 +       if (namelen >= LARGE_RSB_NAME)
7881 +               r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
7882 +       else
7883 +               r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
7884 +
7885 +       if (r)
7886 +               memset(r, 0, sizeof(gd_res_t) + namelen);
7887 +
7888 +       return r;
7889 +}
7890 +
7891 +void free_rsb(gd_res_t *r)
7892 +{
7893 +       int length = r->res_length;
7894 +
7895 +#ifdef POISON
7896 +       memset(r, 0x55, sizeof(gd_res_t) + r->res_length);
7897 +#endif
7898 +
7899 +       if (length >= LARGE_RSB_NAME)
7900 +               kmem_cache_free(rsb_cache_large, r);
7901 +       else
7902 +               kmem_cache_free(rsb_cache_small, r);
7903 +}
7904 +
7905 +gd_lkb_t *allocate_lkb(gd_ls_t *ls)
7906 +{
7907 +       gd_lkb_t *l;
7908 +
7909 +       l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
7910 +       if (l)
7911 +               memset(l, 0, sizeof(gd_lkb_t));
7912 +
7913 +       return l;
7914 +}
7915 +
7916 +void free_lkb(gd_lkb_t *l)
7917 +{
7918 +#ifdef POISON
7919 +       memset(l, 0xAA, sizeof(gd_lkb_t));
7920 +#endif
7921 +       kmem_cache_free(lkb_cache, l);
7922 +}
7923 +
7924 +gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen)
7925 +{
7926 +       gd_resdata_t *rd;
7927 +
7928 +       GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7929 +
7930 +       if (namelen >= LARGE_RES_NAME)
7931 +               rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
7932 +       else
7933 +               rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
7934 +
7935 +       if (rd)
7936 +               memset(rd, 0, sizeof(gd_resdata_t));
7937 +
7938 +       return rd;
7939 +}
7940 +
7941 +void free_resdata(gd_resdata_t *rd)
7942 +{
7943 +       if (rd->rd_length >= LARGE_RES_NAME)
7944 +               kmem_cache_free(resdir_cache_large, rd);
7945 +       else
7946 +               kmem_cache_free(resdir_cache_small, rd);
7947 +}
7948 +
7949 +char *allocate_lvb(gd_ls_t *ls)
7950 +{
7951 +       char *l;
7952 +
7953 +       l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7954 +       if (l)
7955 +               memset(l, 0, DLM_LVB_LEN);
7956 +
7957 +       return l;
7958 +}
7959 +
7960 +void free_lvb(char *l)
7961 +{
7962 +       kmem_cache_free(lvb_cache, l);
7963 +}
7964 +
7965 +/* Ranges are allocated from the LVB cache as they are the same size
7966 + * (4 x 64 bits = 32 bytes = DLM_LVB_LEN) */
7967 +uint64_t *allocate_range(gd_ls_t * ls)
7968 +{
7969 +       uint64_t *l;
7970 +
7971 +       l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7972 +       if (l)
7973 +               memset(l, 0, DLM_LVB_LEN);
7974 +
7975 +       return l;
7976 +}
7977 +
7978 +void free_range(uint64_t *l)
7979 +{
7980 +       kmem_cache_free(lvb_cache, l);
7981 +}
7982 +
7983 +gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls)
7984 +{
7985 +       gd_rcom_t *rc;
7986 +
7987 +       rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
7988 +       if (rc)
7989 +               memset(rc, 0, dlm_config.buffer_size);
7990 +
7991 +       return rc;
7992 +}
7993 +
7994 +void free_rcom_buffer(gd_rcom_t *rc)
7995 +{
7996 +       kfree(rc);
7997 +}
7998 diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
7999 --- linux-orig/cluster/dlm/memory.h     1970-01-01 07:30:00.000000000 +0730
8000 +++ linux-patched/cluster/dlm/memory.h  2004-06-25 18:31:07.000000000 +0800
8001 @@ -0,0 +1,32 @@
8002 +/******************************************************************************
8003 +*******************************************************************************
8004 +**
8005 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
8006 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
8007 +**  
8008 +**  This copyrighted material is made available to anyone wishing to use,
8009 +**  modify, copy, or redistribute it subject to the terms and conditions
8010 +**  of the GNU General Public License v.2.
8011 +**
8012 +*******************************************************************************
8013 +******************************************************************************/
8014 +
8015 +#ifndef __MEMORY_DOT_H__
8016 +#define __MEMORY_DOT_H__
8017 +
8018 +int dlm_memory_init(void);
8019 +void dlm_memory_exit(void);
8020 +gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen);
8021 +void free_rsb(gd_res_t * r);
8022 +gd_lkb_t *allocate_lkb(gd_ls_t * ls);
8023 +void free_lkb(gd_lkb_t * l);
8024 +gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen);
8025 +void free_resdata(gd_resdata_t * rd);
8026 +char *allocate_lvb(gd_ls_t * ls);
8027 +void free_lvb(char *l);
8028 +gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls);
8029 +void free_rcom_buffer(gd_rcom_t * rc);
8030 +uint64_t *allocate_range(gd_ls_t * ls);
8031 +void free_range(uint64_t * l);
8032 +
8033 +#endif         /* __MEMORY_DOT_H__ */
8034 diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8035 --- linux-orig/cluster/dlm/midcomms.c   1970-01-01 07:30:00.000000000 +0730
8036 +++ linux-patched/cluster/dlm/midcomms.c        2004-06-25 18:31:07.000000000 +0800
8037 @@ -0,0 +1,351 @@
8038 +/******************************************************************************
8039 +*******************************************************************************
8040 +**
8041 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
8042 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
8043 +**
8044 +**  This copyrighted material is made available to anyone wishing to use,
8045 +**  modify, copy, or redistribute it subject to the terms and conditions
8046 +**  of the GNU General Public License v.2.
8047 +**
8048 +*******************************************************************************
8049 +******************************************************************************/
8050 +
8051 +/*
8052 + * midcomms.c
8053 + *
8054 + * This is the appallingly named "mid-level" comms layer.
8055 + *
8056 + * Its purpose is to take packets from the "real" comms layer,
8057 + * split them up into messages and pass them to the interested
8058 + * part of the locking mechanism.
8059 + *
8060 + * It also takes messages from the locking layer, formats them
8061 + * into packets and sends them to the comms layer.
8062 + *
8063 + * It knows the format of the mid-level messages and the nodeids
8064 + * used, but it does not know how to resolve a nodeid into an IP
8065 + * address, or any of the comms channel details.
8066 + *
8067 + */
8068 +
8069 +#include "dlm_internal.h"
8070 +#include "lowcomms.h"
8071 +#include "midcomms.h"
8072 +#include "lockqueue.h"
8073 +#include "nodes.h"
8074 +#include "reccomms.h"
8075 +#include "config.h"
8076 +
8077 +/* Byteorder routines: the wire format is little-endian, so these are no-ops on LE hosts */
8078 +
8079 +static void host_to_network(void *msg)
8080 +{
8081 +       struct gd_req_header *head = msg;
8082 +       struct gd_remlockrequest *req = msg;
8083 +       struct gd_remlockreply *reply = msg;
8084 +       struct gd_remquery *query = msg;
8085 +       struct gd_remqueryreply *queryrep = msg;
8086 +       gd_rcom_t *rc = msg;
8087 +
8088 +       /* Force into network byte order */
8089 +
8090 +       /*
8091 +        * Do the common header first
8092 +        */
8093 +
8094 +       head->rh_length = cpu_to_le16(head->rh_length);
8095 +       head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8096 +       /* Leave the lkid alone as it is transparent at the remote end */
8097 +
8098 +       /*
8099 +        * Do the fields in the remlockrequest or remlockreply structs
8100 +        */
8101 +
8102 +       switch (req->rr_header.rh_cmd) {
8103 +
8104 +       case GDLM_REMCMD_LOCKREQUEST:
8105 +       case GDLM_REMCMD_CONVREQUEST:
8106 +               req->rr_range_start = cpu_to_le64(req->rr_range_start);
8107 +               req->rr_range_end = cpu_to_le64(req->rr_range_end);
8108 +               /* Deliberate fall through */
8109 +       case GDLM_REMCMD_UNLOCKREQUEST:
8110 +       case GDLM_REMCMD_LOOKUP:
8111 +       case GDLM_REMCMD_LOCKGRANT:
8112 +       case GDLM_REMCMD_SENDBAST:
8113 +       case GDLM_REMCMD_SENDCAST:
8114 +       case GDLM_REMCMD_REM_RESDATA:
8115 +               req->rr_flags = cpu_to_le32(req->rr_flags);
8116 +               req->rr_status = cpu_to_le32(req->rr_status);
8117 +               break;
8118 +
8119 +       case GDLM_REMCMD_LOCKREPLY:
8120 +               reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate);
8121 +               reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid);
8122 +               reply->rl_status = cpu_to_le32(reply->rl_status);
8123 +               break;
8124 +
8125 +       case GDLM_REMCMD_RECOVERMESSAGE:
8126 +       case GDLM_REMCMD_RECOVERREPLY:
8127 +               rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8128 +               rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8129 +               break;
8130 +
8131 +       case GDLM_REMCMD_QUERY:
8132 +               query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid);
8133 +               query->rq_query = cpu_to_le32(query->rq_query);
8134 +               query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks);
8135 +               break;
8136 +
8137 +       case GDLM_REMCMD_QUERYREPLY:
8138 +               queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks);
8139 +               queryrep->rq_status = cpu_to_le32(queryrep->rq_status);
8140 +               queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount);
8141 +               queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount);
8142 +               queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount);
8143 +               break;
8144 +
8145 +       default:
8146 +               printk("dlm: warning, unknown REMCMD type %u\n",
8147 +                      req->rr_header.rh_cmd);
8148 +       }
8149 +}
8150 +
8151 +static void network_to_host(void *msg)
8152 +{
8153 +       struct gd_req_header *head = msg;
8154 +       struct gd_remlockrequest *req = msg;
8155 +       struct gd_remlockreply *reply = msg;
8156 +       struct gd_remquery *query = msg;
8157 +       struct gd_remqueryreply *queryrep = msg;
8158 +       gd_rcom_t *rc = msg;
8159 +
8160 +       /* Force into host byte order */
8161 +
8162 +       /*
8163 +        * Do the common header first
8164 +        */
8165 +
8166 +       head->rh_length = le16_to_cpu(head->rh_length);
8167 +       head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8168 +       /* Leave the lkid alone as it is transparent at the remote end */
8169 +
8170 +       /*
8171 +        * Do the fields in the remlockrequest or remlockreply structs
8172 +        */
8173 +
8174 +       switch (req->rr_header.rh_cmd) {
8175 +
8176 +       case GDLM_REMCMD_LOCKREQUEST:
8177 +       case GDLM_REMCMD_CONVREQUEST:
8178 +               req->rr_range_start = le64_to_cpu(req->rr_range_start);
8179 +               req->rr_range_end = le64_to_cpu(req->rr_range_end);
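+               /* Deliberate fall through */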
8180 +       case GDLM_REMCMD_LOOKUP:
8181 +       case GDLM_REMCMD_UNLOCKREQUEST:
8182 +       case GDLM_REMCMD_LOCKGRANT:
8183 +       case GDLM_REMCMD_SENDBAST:
8184 +       case GDLM_REMCMD_SENDCAST:
8185 +       case GDLM_REMCMD_REM_RESDATA:
8186 +               /* Actually, not much to do here as the remote lock IDs are
8187 +                * transparent too */
8188 +               req->rr_flags = le32_to_cpu(req->rr_flags);
8189 +               req->rr_status = le32_to_cpu(req->rr_status);
8190 +               break;
8191 +
8192 +       case GDLM_REMCMD_LOCKREPLY:
8193 +               reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate);
8194 +               reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid);
8195 +               reply->rl_status = le32_to_cpu(reply->rl_status);
8196 +               break;
8197 +
8198 +       case GDLM_REMCMD_RECOVERMESSAGE:
8199 +       case GDLM_REMCMD_RECOVERREPLY:
8200 +               rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8201 +               rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8202 +               break;
8203 +
8204 +
8205 +       case GDLM_REMCMD_QUERY:
8206 +               query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid);
8207 +               query->rq_query = le32_to_cpu(query->rq_query);
8208 +               query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks);
8209 +               break;
8210 +
8211 +       case GDLM_REMCMD_QUERYREPLY:
8212 +               queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks);
8213 +               queryrep->rq_status = le32_to_cpu(queryrep->rq_status);
8214 +               queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount);
8215 +               queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount);
8216 +               queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount);
8217 +               break;
8218 +
8219 +       default:
8220 +               printk("dlm: warning, unknown REMCMD type %u\n",
8221 +                      req->rr_header.rh_cmd);
8222 +       }
8223 +}
8224 +
8225 +static void copy_from_cb(void *dst, const void *base, unsigned offset,
8226 +                        unsigned len, unsigned limit)
8227 +{
8228 +       unsigned copy = len;
8229 +
8230 +       if ((copy + offset) > limit)
8231 +               copy = limit - offset;
8232 +       memcpy(dst, base + offset, copy);
8233 +       len -= copy;
8234 +       if (len)
8235 +               memcpy(dst + copy, base, len);
8236 +}
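+
+/* Worked example: with a circular buffer of limit == 8 bytes,
+ * copy_from_cb(dst, base, 6, 4, 8) copies bytes 6..7 first, then wraps
+ * and copies bytes 0..1, reassembling a message that straddled the end
+ * of the buffer.
+ */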
8237 +
8238 +static void khexdump(const unsigned char *c, int len)
8239 +{
8240 +       while (len >= 16) {
8241 +               printk(KERN_INFO
8242 +                      "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8243 +                      c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8244 +                      c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8245 +               len -= 16;
8246 +       }
8247 +       while (len >= 4) {
8248 +               printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8249 +                      c[3]);
8250 +               len -= 4;
8251 +       }
8252 +       while (len > 0) {
8253 +               printk(KERN_INFO "%02x\n", c[0]);
8254 +               len--;
8255 +       }
8256 +}
8257 +
8258 +/*
8259 + * Called from the low-level comms layer to process a buffer of
8260 + * commands.
8261 + *
8262 + * Only complete messages are processed here, any "spare" bytes from
8263 + * the end of a buffer are saved and tacked onto the front of the next
8264 + * message that comes in. I doubt this will happen very often but we
8265 + * need to be able to cope with it and I don't want the task to be waiting
8266 + * for packets to come in when there is useful work to be done.
8267 + *
8268 + */
8269 +int midcomms_process_incoming_buffer(int nodeid, const void *base,
8270 +                                    unsigned offset, unsigned len,
8271 +                                    unsigned limit)
8272 +{
8273 +       unsigned char __tmp[sizeof(struct gd_req_header) + 64];
8274 +       struct gd_req_header *msg = (struct gd_req_header *) __tmp;
8275 +       int ret = 0;
8276 +       int err = 0;
8277 +       unsigned msglen;
8278 +       __u32 id, space;
8279 +
8280 +       while (len > sizeof(struct gd_req_header)) {
8281 +               /* Get message header and check it over */
8282 +               copy_from_cb(msg, base, offset, sizeof(struct gd_req_header),
8283 +                            limit);
8284 +               msglen = le16_to_cpu(msg->rh_length);
8285 +               id = msg->rh_lkid;
8286 +               space = msg->rh_lockspace;
8287 +
8288 +               /* Check message size */
8289 +               err = -EINVAL;
8290 +               if (msglen < sizeof(struct gd_req_header))
8291 +                       break;
8292 +               err = -E2BIG;
8293 +               if (msglen > dlm_config.buffer_size) {
8294 +                       printk("dlm: message size too big %d\n", msglen);
8295 +                       break;
8296 +               }
8297 +               err = 0;
8298 +
8299 +               /* Not enough in buffer yet? wait for some more */
8300 +               if (msglen > len)
8301 +                       break;
8302 +
8303 +               /* Make sure our temp buffer is large enough */
8304 +               if (msglen > sizeof(__tmp) &&
8305 +                   msg == (struct gd_req_header *) __tmp) {
8306 +                       msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8307 +                       if (msg == NULL)
8308 +                               return ret;
8309 +               }
8310 +
8311 +               copy_from_cb(msg, base, offset, msglen, limit);
8312 +               BUG_ON(id != msg->rh_lkid);
8313 +               BUG_ON(space != msg->rh_lockspace);
8314 +               ret += msglen;
8315 +               offset += msglen;
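+               /* wrap the offset: "limit", the circular buffer size,
+                * is assumed to be a power of two */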
8316 +               offset &= (limit - 1);
8317 +               len -= msglen;
8318 +               network_to_host(msg);
8319 +
8320 +               if ((msg->rh_cmd > 32) ||
8321 +                   (msg->rh_cmd == 0) ||
8322 +                   (msg->rh_length < sizeof(struct gd_req_header)) ||
8323 +                   (msg->rh_length > dlm_config.buffer_size)) {
8324 +
8325 +                       printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8326 +                              "lkid=%u, lockspace=%u\n",
8327 +                              msg->rh_cmd, msg->rh_flags, msg->rh_length,
8328 +                              msg->rh_lkid, msg->rh_lockspace);
8329 +
8330 +                       printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8331 +                              "ret=%u, limit=%08x newbuf=%d\n",
8332 +                              base, offset, len, ret, limit,
8333 +                              ((struct gd_req_header *) __tmp == msg));
8334 +
8335 +                       khexdump((const unsigned char *) msg, msg->rh_length);
8336 +
8337 +                       return -EBADMSG;
8338 +               }
8339 +
8340 +               switch (msg->rh_cmd) {
8341 +               case GDLM_REMCMD_RECOVERMESSAGE:
8342 +               case GDLM_REMCMD_RECOVERREPLY:
8343 +                       process_recovery_comm(nodeid, msg);
8344 +                       break;
8345 +               default:
8346 +                       process_cluster_request(nodeid, msg, FALSE);
8347 +               }
8348 +       }
8349 +
8350 +       if (msg != (struct gd_req_header *) __tmp)
8351 +               kfree(msg);
8352 +
8353 +       return err ? err : ret;
8354 +}
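+
+/* A sketch of the caller's side of this contract (illustrative only;
+ * the real caller is the receive path in lowcomms.c): a positive return
+ * is the number of bytes consumed, and any tail fragment stays in the
+ * ring for the next call.
+ *
+ *     n = midcomms_process_incoming_buffer(nodeid, ring, off, avail, limit);
+ *     if (n > 0) {
+ *             off = (off + n) & (limit - 1);
+ *             avail -= n;
+ *     }
+ */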
8355 +
8356 +/*
8357 + * Send a lowcomms buffer
8358 + */
8359 +
8360 +void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e)
8361 +{
8362 +       host_to_network(msg);
8363 +       lowcomms_commit_buffer(e);
8364 +}
8365 +
8366 +/*
8367 + * Make the message into network byte order and send it
8368 + */
8369 +
8370 +int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg,
8371 +                         int allocation)
8372 +{
8373 +       int len = msg->rh_length;
8374 +
8375 +       host_to_network(msg);
8376 +
8377 +       /*
8378 +        * Loopback.  In fact, the locking code pretty much prevents this from
8379 +        * being needed but it can happen when the directory node is also the
8380 +        * local node.
8381 +        */
8382 +
8383 +       if (nodeid == our_nodeid())
8384 +               return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
8385 +                                                       len, len);
8386 +
8387 +       return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
8388 +}
8389 diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
8390 --- linux-orig/cluster/dlm/midcomms.h   1970-01-01 07:30:00.000000000 +0730
8391 +++ linux-patched/cluster/dlm/midcomms.h        2004-06-25 18:31:07.000000000 +0800
8392 @@ -0,0 +1,24 @@
8393 +/******************************************************************************
8394 +*******************************************************************************
8395 +**
8396 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
8397 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
8398 +**  
8399 +**  This copyrighted material is made available to anyone wishing to use,
8400 +**  modify, copy, or redistribute it subject to the terms and conditions
8401 +**  of the GNU General Public License v.2.
8402 +**
8403 +*******************************************************************************
8404 +******************************************************************************/
8405 +
8406 +#ifndef __MIDCOMMS_DOT_H__
8407 +#define __MIDCOMMS_DOT_H__
8408 +
8409 +int midcomms_send_message(uint32_t csid, struct gd_req_header *msg,
8410 +                         int allocation);
8411 +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
8412 +                                    unsigned len, unsigned limit);
8413 +void midcomms_send_buffer(struct gd_req_header *msg,
8414 +                         struct writequeue_entry *e);
8415 +
8416 +#endif                         /* __MIDCOMMS_DOT_H__ */
8417 diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
8418 --- linux-orig/cluster/dlm/nodes.c      1970-01-01 07:30:00.000000000 +0730
8419 +++ linux-patched/cluster/dlm/nodes.c   2004-06-25 18:31:07.000000000 +0800
8420 @@ -0,0 +1,325 @@
8421 +/******************************************************************************
8422 +*******************************************************************************
8423 +**
8424 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
8425 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
8426 +**  
8427 +**  This copyrighted material is made available to anyone wishing to use,
8428 +**  modify, copy, or redistribute it subject to the terms and conditions
8429 +**  of the GNU General Public License v.2.
8430 +**
8431 +*******************************************************************************
8432 +******************************************************************************/
8433 +
8434 +#include <net/sock.h>
8435 +#include <cluster/cnxman.h>
8436 +
8437 +#include "dlm_internal.h"
8438 +#include "lowcomms.h"
8439 +#include "nodes.h"
8440 +#include "recover.h"
8441 +#include "reccomms.h"
8442 +#include "util.h"
8443 +
8444 +static struct list_head cluster_nodes;
8445 +static spinlock_t node_lock;
8446 +static uint32_t local_nodeid;
8447 +static struct semaphore local_init_lock;
8448 +
8449 +
8450 +void dlm_nodes_init(void)
8451 +{
8452 +       INIT_LIST_HEAD(&cluster_nodes);
8453 +       spin_lock_init(&node_lock);
8454 +       local_nodeid = 0;
8455 +       init_MUTEX(&local_init_lock);
8456 +}
8457 +
8458 +static gd_node_t *search_node(uint32_t nodeid)
8459 +{
8460 +       gd_node_t *node;
8461 +
8462 +       list_for_each_entry(node, &cluster_nodes, gn_list) {
8463 +               if (node->gn_nodeid == nodeid)
8464 +                       goto out;
8465 +       }
8466 +       node = NULL;
8467 +      out:
8468 +       return node;
8469 +}
8470 +
8471 +static void put_node(gd_node_t *node)
8472 +{
8473 +       spin_lock(&node_lock);
8474 +       node->gn_refcount--;
8475 +       if (node->gn_refcount == 0) {
8476 +               list_del(&node->gn_list);
8477 +               spin_unlock(&node_lock);
8478 +               kfree(node);
8479 +               return;
8480 +       }
8481 +       spin_unlock(&node_lock);
8482 +}
8483 +
8484 +static int get_node(uint32_t nodeid, gd_node_t **ndp)
8485 +{
8486 +       gd_node_t *node, *node2;
8487 +       int error = -ENOMEM;
8488 +
8489 +       spin_lock(&node_lock);
8490 +       node = search_node(nodeid);
8491 +       if (node)
8492 +               node->gn_refcount++;
8493 +       spin_unlock(&node_lock);
8494 +
8495 +       if (node)
8496 +               goto out;
8497 +
8498 +       node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL);
8499 +       if (!node)
8500 +               goto fail;
8501 +
8502 +       memset(node, 0, sizeof(gd_node_t));
8503 +       node->gn_nodeid = nodeid;
8504 +
8505 +       spin_lock(&node_lock);
8506 +       node2 = search_node(nodeid);
8507 +       if (node2) {
8508 +               node2->gn_refcount++;
8509 +               spin_unlock(&node_lock);
8510 +               kfree(node);
8511 +               node = node2;
8512 +               goto out;
8513 +       }
8514 +
8515 +       node->gn_refcount = 1;
8516 +       list_add_tail(&node->gn_list, &cluster_nodes);
8517 +       spin_unlock(&node_lock);
8518 +
8519 +      out:
8520 +       *ndp = node;
8521 +       return 0;
8522 +
8523 +      fail:
8524 +       return error;
8525 +}
8526 +
8527 +int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb)
8528 +{
8529 +       gd_csb_t *csb;
8530 +       gd_node_t *node;
8531 +       int error = -ENOMEM;
8532 +
8533 +       csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL);
8534 +       if (!csb)
8535 +               goto fail;
8536 +
8537 +       memset(csb, 0, sizeof(gd_csb_t));
8538 +
8539 +       error = get_node(nodeid, &node);
8540 +       if (error)
8541 +               goto fail_free;
8542 +
8543 +       csb->csb_node = node;
8544 +
8545 +       down(&local_init_lock);
8546 +
8547 +       if (!local_nodeid) {
8548 +               if (nodeid == our_nodeid()) {
8549 +                       local_nodeid = node->gn_nodeid;
8550 +               }
8551 +       }
8552 +       up(&local_init_lock);
8553 +
8554 +       *ret_csb = csb;
8555 +       return 0;
8556 +
8557 +      fail_free:
8558 +       kfree(csb);
8559 +      fail:
8560 +       return error;
8561 +}
8562 +
8563 +void release_csb(gd_csb_t *csb)
8564 +{
8565 +       put_node(csb->csb_node);
8566 +       kfree(csb);
8567 +}
8568 +
8569 +uint32_t our_nodeid(void)
8570 +{
8571 +       return lowcomms_our_nodeid();
8572 +}
8573 +
8574 +int nodes_reconfig_wait(gd_ls_t *ls)
8575 +{
8576 +       int error;
8577 +
8578 +       if (ls->ls_low_nodeid == our_nodeid()) {
8579 +               error = gdlm_wait_status_all(ls, NODES_VALID);
8580 +               if (!error)
8581 +                       set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
8582 +
8583 +               /* Experimental: this delay should allow any final messages
8584 +                * from the previous node to be received before beginning
8585 +                * recovery. */
8586 +
8587 +               if (ls->ls_num_nodes == 1) {
8588 +                       set_current_state(TASK_UNINTERRUPTIBLE);
8589 +                       schedule_timeout(2 * HZ);
8590 +               }
8591 +
8592 +       } else
8593 +               error = gdlm_wait_status_low(ls, NODES_ALL_VALID);
8594 +
8595 +       return error;
8596 +}
8597 +
8598 +static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new)
8599 +{
8600 +       gd_csb_t *csb = NULL;
8601 +       struct list_head *tmp;
8602 +       struct list_head *newlist = &new->csb_list;
8603 +       struct list_head *head = &ls->ls_nodes;
8604 +
8605 +       list_for_each(tmp, head) {
8606 +               csb = list_entry(tmp, gd_csb_t, csb_list);
8607 +
8608 +               if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid)
8609 +                       break;
8610 +       }
8611 +
8612 +       if (!csb)
8613 +               list_add_tail(newlist, head);
8614 +       else {
8615 +               /* FIXME: can use list macro here */
8616 +               newlist->prev = tmp->prev;
8617 +               newlist->next = tmp;
8618 +               tmp->prev->next = newlist;
8619 +               tmp->prev = newlist;
8620 +       }
8621 +}
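+
+/* e.g. with nodes 2, 5, 9 already in ls_nodes, adding node 7 walks the
+ * list until it reaches 9 (the first larger nodeid) and is linked in
+ * just before it. */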
8622 +
8623 +int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out)
8624 +{
8625 +       gd_csb_t *csb, *safe;
8626 +       int error, i, found, pos = 0, neg = 0;
8627 +       uint32_t low = (uint32_t) (-1);
8628 +
8629 +       /* 
8630 +        * Remove (and save) departed nodes from lockspace's nodes list
8631 +        */
8632 +
8633 +       list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) {
8634 +               found = FALSE;
8635 +               for (i = 0; i < gr->gr_node_count; i++) {
8636 +                       if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8637 +                               found = TRUE;
8638 +                               break;
8639 +                       }
8640 +               }
8641 +
8642 +               if (!found) {
8643 +                       neg++;
8644 +                       csb->csb_gone_event = gr->gr_event_id;
8645 +                       list_del(&csb->csb_list);
8646 +                       list_add_tail(&csb->csb_list, &ls->ls_nodes_gone);
8647 +                       ls->ls_num_nodes--;
8648 +                       log_all(ls, "remove node %u", csb->csb_node->gn_nodeid);
8649 +               }
8650 +       }
8651 +
8652 +       /* 
8653 +        * Add new nodes to lockspace's nodes list
8654 +        */
8655 +
8656 +       for (i = 0; i < gr->gr_node_count; i++) {
8657 +               found = FALSE;
8658 +               list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8659 +                       if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8660 +                               found = TRUE;
8661 +                               break;
8662 +                       }
8663 +               }
8664 +
8665 +               if (!found) {
8666 +                       pos++;
8667 +
8668 +                       error = init_new_csb(gr->gr_nodeids[i], &csb);
8669 +                       GDLM_ASSERT(!error,);
8670 +
8671 +                       add_ordered_node(ls, csb);
8672 +                       ls->ls_num_nodes++;
8673 +                       log_all(ls, "add node %u", csb->csb_node->gn_nodeid);
8674 +               }
8675 +       }
8676 +
8677 +       list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8678 +               if (csb->csb_node->gn_nodeid < low)
8679 +                       low = csb->csb_node->gn_nodeid;
8680 +       }
8681 +
8682 +       rcom_log_clear(ls);
8683 +       ls->ls_low_nodeid = low;
8684 +       ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8685 +       set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8686 +       *neg_out = neg;
8687 +
8688 +       error = nodes_reconfig_wait(ls);
8689 +
8690 +       log_all(ls, "total nodes %d", ls->ls_num_nodes);
8691 +
8692 +       return error;
8693 +}
8694 +
8695 +int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr)
8696 +{
8697 +       gd_csb_t *csb;
8698 +       int i, error;
8699 +       uint32_t low = (uint32_t) (-1);
8700 +
8701 +       log_all(ls, "add nodes");
8702 +
8703 +       for (i = 0; i < gr->gr_node_count; i++) {
8704 +               error = init_new_csb(gr->gr_nodeids[i], &csb);
8705 +               if (error)
8706 +                       goto fail;
8707 +
8708 +               add_ordered_node(ls, csb);
8709 +               ls->ls_num_nodes++;
8710 +
8711 +               if (csb->csb_node->gn_nodeid < low)
8712 +                       low = csb->csb_node->gn_nodeid;
8713 +       }
8714 +
8715 +       ls->ls_low_nodeid = low;
8716 +       ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8717 +       set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8718 +
8719 +       error = nodes_reconfig_wait(ls);
8720 +
8721 +       log_all(ls, "total nodes %d", ls->ls_num_nodes);
8722 +
8723 +       return error;
8724 +
8725 +      fail:
8726 +       while (!list_empty(&ls->ls_nodes)) {
8727 +               csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list);
8728 +               list_del(&csb->csb_list);
8729 +               release_csb(csb);
8730 +       }
8731 +       ls->ls_num_nodes = 0;
8732 +
8733 +       return error;
8734 +}
8735 +
8736 +int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid)
8737 +{
8738 +       gd_csb_t *csb;
8739 +
8740 +       list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) {
8741 +               if (csb->csb_node->gn_nodeid == nodeid)
8742 +                       return TRUE;
8743 +       }
8744 +       return FALSE;
8745 +}
8746 diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
8747 --- linux-orig/cluster/dlm/nodes.h      1970-01-01 07:30:00.000000000 +0730
8748 +++ linux-patched/cluster/dlm/nodes.h   2004-06-25 18:31:07.000000000 +0800
8749 @@ -0,0 +1,25 @@
8750 +/******************************************************************************
8751 +*******************************************************************************
8752 +**
8753 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
8754 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
8755 +**  
8756 +**  This copyrighted material is made available to anyone wishing to use,
8757 +**  modify, copy, or redistribute it subject to the terms and conditions
8758 +**  of the GNU General Public License v.2.
8759 +**
8760 +*******************************************************************************
8761 +******************************************************************************/
8762 +
8763 +#ifndef __NODES_DOT_H__
8764 +#define __NODES_DOT_H__
8765 +
8766 +void dlm_nodes_init(void);
8767 +int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb);
8768 +void release_csb(gd_csb_t * csb);
8769 +uint32_t our_nodeid(void);
8770 +int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg);
8771 +int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr);
8772 +int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid);
8773 +
8774 +#endif                         /* __NODES_DOT_H__ */
8775 diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
8776 --- linux-orig/cluster/dlm/proc.c       1970-01-01 07:30:00.000000000 +0730
8777 +++ linux-patched/cluster/dlm/proc.c    2004-06-25 18:31:07.000000000 +0800
8778 @@ -0,0 +1,469 @@
8779 +/******************************************************************************
8780 +*******************************************************************************
8781 +**
8782 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
8783 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
8784 +**  
8785 +**  This copyrighted material is made available to anyone wishing to use,
8786 +**  modify, copy, or redistribute it subject to the terms and conditions
8787 +**  of the GNU General Public License v.2.
8788 +**
8789 +*******************************************************************************
8790 +******************************************************************************/
8791 +
8792 +#include <linux/init.h>
8793 +#include <linux/proc_fs.h>
8794 +#include <linux/ctype.h>
8795 +#include <linux/seq_file.h>
8796 +#include <linux/module.h>
8797 +
8798 +#include "dlm_internal.h"
8799 +#include "lockspace.h"
8800 +
8801 +#if defined(DLM_DEBUG)
8802 +#define DLM_DEBUG_SIZE         (1024)
8803 +#define MAX_DEBUG_MSG_LEN      (64)
8804 +#else
8805 +#define DLM_DEBUG_SIZE         (0)
8806 +#define MAX_DEBUG_MSG_LEN      (0)
8807 +#endif
8808 +
8809 +static char *                  debug_buf;
8810 +static unsigned int            debug_size;
8811 +static unsigned int            debug_point;
8812 +static int                     debug_wrap;
8813 +static spinlock_t              debug_lock;
8814 +static struct proc_dir_entry * debug_proc_entry = NULL;
8815 +static struct proc_dir_entry * rcom_proc_entry = NULL;
8816 +static char                    proc_ls_name[255] = "";
8817 +
8818 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
8819 +static struct proc_dir_entry * locks_proc_entry = NULL;
8820 +static struct seq_operations   locks_info_op;
8821 +
8822 +
8823 +static int locks_open(struct inode *inode, struct file *file)
8824 +{
8825 +       return seq_open(file, &locks_info_op);
8826 +}
8827 +
8828 +/* Write simply sets the lockspace to use */
8829 +static ssize_t locks_write(struct file *file, const char *buf,
8830 +                          size_t count, loff_t * ppos)
8831 +{
8832 +       if (count < sizeof(proc_ls_name)) {
8833 +               if (copy_from_user(proc_ls_name, buf, count))
8834 +                       return -EFAULT;
8835 +               proc_ls_name[count] = '\0';
8836 +               /* Remove any trailing LF so that lazy users
8837 +                  can just echo "lsname" > /proc/cluster/dlm_locks */
8838 +               if (count && proc_ls_name[count - 1] == '\n')
8839 +                       proc_ls_name[count - 1] = '\0';
8840 +
8841 +               return count;
8842 +       }
8843 +       return 0;
8844 +}
8845 +
8846 +static struct file_operations locks_fops = {
8847 +       .open    = locks_open,
8848 +       .write   = locks_write,
8849 +       .read    = seq_read,
8850 +       .llseek  = seq_lseek,
8851 +       .release = seq_release,
8852 +};
8853 +
8854 +struct ls_dumpinfo {
8855 +       int entry;
8856 +       struct list_head *next;
8857 +       gd_ls_t *ls;
8858 +       gd_res_t *rsb;
8859 +};
8860 +
8861 +static int print_resource(gd_res_t * res, struct seq_file *s);
8862 +
8863 +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
8864 +{
8865 +       read_lock(&di->ls->ls_reshash_lock);
8866 +       if (!di->next) {
8867 +               /* Find the next non-empty hash bucket */
8868 +               while (di->entry < di->ls->ls_hashsize &&
8869 +                      list_empty(&di->ls->ls_reshashtbl[di->entry])) {
8870 +                       di->entry++;
8871 +               }
8872 +               if (di->entry >= di->ls->ls_hashsize) {
8873 +                       read_unlock(&di->ls->ls_reshash_lock);
8874 +                       return NULL;    /* End of hash list */
8875 +               }
8876 +
8877 +               di->next = di->ls->ls_reshashtbl[di->entry].next;
8878 +       } else {                /* Find the next entry in the list */
8879 +
8880 +               di->next = di->next->next;
8881 +               if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) {
8882 +                       /* End of list - move to next bucket */
8883 +                       di->next = NULL;
8884 +                       di->entry++;
8885 +                       read_unlock(&di->ls->ls_reshash_lock);
8886 +
8887 +                       return next_rsb(di);    /* do the top half of this conditional */
8888 +               }
8889 +       }
8890 +       di->rsb = list_entry(di->next, gd_res_t, res_hashchain);
8891 +       read_unlock(&di->ls->ls_reshash_lock);
8892 +
8893 +       return di;
8894 +}
8895 +
8896 +static void *s_start(struct seq_file *m, loff_t * pos)
8897 +{
8898 +       struct ls_dumpinfo *di;
8899 +       gd_ls_t *ls;
8900 +       int i;
8901 +
8902 +       ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
8903 +       if (!ls)
8904 +               return NULL;
8905 +
8906 +       di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
8907 +       if (!di)
8908 +               return NULL;
8909 +
8910 +       if (*pos == 0)
8911 +               seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
8912 +
8913 +       di->entry = 0;
8914 +       di->next = NULL;
8915 +       di->ls = ls;
8916 +
8917 +       for (i = 0; i < *pos; i++)
8918 +               if (next_rsb(di) == NULL) {
8919 +                       kfree(di);      /* don't leak the iterator */
8920 +                       return NULL;
8921 +               }
8920 +
8921 +       return next_rsb(di);
8922 +}
8923 +
8924 +static void *s_next(struct seq_file *m, void *p, loff_t * pos)
8925 +{
8926 +       struct ls_dumpinfo *di = p;
8927 +
8928 +       *pos += 1;
8929 +
8930 +       return next_rsb(di);
8931 +}
8932 +
8933 +static int s_show(struct seq_file *m, void *p)
8934 +{
8935 +       struct ls_dumpinfo *di = p;
8936 +       return print_resource(di->rsb, m);
8937 +}
8938 +
8939 +static void s_stop(struct seq_file *m, void *p)
8940 +{
8941 +       kfree(p);
8942 +}
8943 +
8944 +static struct seq_operations locks_info_op = {
8945 +       .start = s_start,
8946 +       .next  = s_next,
8947 +       .stop  = s_stop,
8948 +       .show  = s_show
8949 +};
8950 +
8951 +static char *print_lockmode(int mode)
8952 +{
8953 +       switch (mode) {
8954 +       case DLM_LOCK_IV:
8955 +               return "--";
8956 +       case DLM_LOCK_NL:
8957 +               return "NL";
8958 +       case DLM_LOCK_CR:
8959 +               return "CR";
8960 +       case DLM_LOCK_CW:
8961 +               return "CW";
8962 +       case DLM_LOCK_PR:
8963 +               return "PR";
8964 +       case DLM_LOCK_PW:
8965 +               return "PW";
8966 +       case DLM_LOCK_EX:
8967 +               return "EX";
8968 +       default:
8969 +               return "??";
8970 +       }
8971 +}
8972 +
8973 +static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res)
8974 +{
8975 +
8976 +       seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
8977 +
8978 +       if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8979 +           || lkb->lkb_status == GDLM_LKSTS_WAITING)
8980 +               seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
8981 +
8982 +       if (lkb->lkb_range) {
8983 +               /* This warns on Alpha. Tough. Only I see it */
8984 +               if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8985 +                   || lkb->lkb_status == GDLM_LKSTS_GRANTED)
8986 +                       seq_printf(s, " %" PRIx64 "-%" PRIx64,
8987 +                                  lkb->lkb_range[GR_RANGE_START],
8988 +                                  lkb->lkb_range[GR_RANGE_END]);
8989 +               if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8990 +                   || lkb->lkb_status == GDLM_LKSTS_WAITING)
8991 +                       seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
8992 +                                  lkb->lkb_range[RQ_RANGE_START],
8993 +                                  lkb->lkb_range[RQ_RANGE_END]);
8994 +       }
8995 +
8996 +       if (lkb->lkb_nodeid) {
8997 +               if (lkb->lkb_nodeid != res->res_nodeid)
8998 +                       seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
8999 +                                  lkb->lkb_remid);
9000 +               else
9001 +                       seq_printf(s, " Master:     %08x", lkb->lkb_remid);
9002 +       }
9003 +
9004 +       if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9005 +               seq_printf(s, "  LQ: %d", lkb->lkb_lockqueue_state);
9006 +
9007 +       seq_printf(s, "\n");
9008 +}
9009 +
9010 +static int print_resource(gd_res_t *res, struct seq_file *s)
9011 +{
9012 +       int i;
9013 +       struct list_head *locklist;
9014 +
9015 +       seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9016 +                  res->res_parent, res->res_length);
9017 +       for (i = 0; i < res->res_length; i++) {
9018 +               if (isprint(res->res_name[i]))
9019 +                       seq_printf(s, "%c", res->res_name[i]);
9020 +               else
9021 +                       seq_printf(s, "%c", '.');
9022 +       }
9023 +       if (res->res_nodeid)
9024 +               seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
9025 +                          res->res_nodeid);
9026 +       else
9027 +               seq_printf(s, "\"  \nMaster Copy\n");
9028 +
9029 +       /* Print the LVB: */
9030 +       if (res->res_lvbptr) {
9031 +               seq_printf(s, "LVB: ");
9032 +               for (i = 0; i < DLM_LVB_LEN; i++) {
9033 +                       if (i == DLM_LVB_LEN / 2)
9034 +                               seq_printf(s, "\n     ");
9035 +                       seq_printf(s, "%02x ",
9036 +                                  (unsigned char) res->res_lvbptr[i]);
9037 +               }
9038 +               seq_printf(s, "\n");
9039 +       }
9040 +
9041 +       /* Print the locks attached to this resource */
9042 +       seq_printf(s, "Granted Queue\n");
9043 +       list_for_each(locklist, &res->res_grantqueue) {
9044 +               gd_lkb_t *this_lkb =
9045 +                   list_entry(locklist, gd_lkb_t, lkb_statequeue);
9046 +               print_lock(s, this_lkb, res);
9047 +       }
9048 +
9049 +       seq_printf(s, "Conversion Queue\n");
9050 +       list_for_each(locklist, &res->res_convertqueue) {
9051 +               gd_lkb_t *this_lkb =
9052 +                   list_entry(locklist, gd_lkb_t, lkb_statequeue);
9053 +               print_lock(s, this_lkb, res);
9054 +       }
9055 +
9056 +       seq_printf(s, "Waiting Queue\n");
9057 +       list_for_each(locklist, &res->res_waitqueue) {
9058 +               gd_lkb_t *this_lkb =
9059 +                   list_entry(locklist, gd_lkb_t, lkb_statequeue);
9060 +               print_lock(s, this_lkb, res);
9061 +       }
9062 +       return 0;
9063 +}
9064 +#endif                         /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9065 +
9066 +void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...)
9067 +{
9068 +       va_list va;
9069 +       int i, n, size, len;
9070 +       char buf[MAX_DEBUG_MSG_LEN+1];
9071 +
9072 +       spin_lock(&debug_lock);
9073 +
9074 +       if (!debug_buf)
9075 +               goto out;
9076 +
9077 +       size = MAX_DEBUG_MSG_LEN;
9078 +       memset(buf, 0, size+1);
9079 +
9080 +       n = snprintf(buf, size, "%s ", ls->ls_name);
9081 +       size -= n;
9082 +
9083 +       va_start(va, fmt);
9084 +       vsnprintf(buf+n, size, fmt, va);
9085 +       va_end(va);
9086 +
9087 +       len = strlen(buf);
9088 +       if (len > MAX_DEBUG_MSG_LEN-1)
9089 +               len = MAX_DEBUG_MSG_LEN-1;
9090 +       buf[len] = '\n';
9091 +       buf[len+1] = '\0';
9092 +
9093 +       for (i = 0; i < strlen(buf); i++) {
9094 +               debug_buf[debug_point++] = buf[i];
9095 +
9096 +               if (debug_point == debug_size) {
9097 +                       debug_point = 0;
9098 +                       debug_wrap = 1;
9099 +               }
9100 +       }
9101 + out:
9102 +       spin_unlock(&debug_lock);
9103 +}
9104 +
9105 +void dlm_debug_dump(void)
9106 +{
9107 +       int i;
9108 +
9109 +       spin_lock(&debug_lock);
9110 +       if (debug_wrap) {
9111 +               for (i = debug_point; i < debug_size; i++)
9112 +                       printk("%c", debug_buf[i]);
9113 +       }
9114 +       for (i = 0; i < debug_point; i++)
9115 +               printk("%c", debug_buf[i]);
9116 +       spin_unlock(&debug_lock);
9117 +}
9118 +
9119 +void dlm_debug_setup(int size)
9120 +{
9121 +       char *b = NULL;
9122 +
9123 +       if (size > PAGE_SIZE)
9124 +               size = PAGE_SIZE;
9125 +       if (size)
9126 +               b = kmalloc(size, GFP_KERNEL);
9127 +
9128 +       spin_lock(&debug_lock);
9129 +       if (debug_buf)
9130 +               kfree(debug_buf);
9131 +       if (!size || !b)
9132 +               goto out;
9133 +       debug_size = size;
9134 +       debug_point = 0;
9135 +       debug_wrap = 0;
9136 +       debug_buf = b;
9137 +       memset(debug_buf, 0, debug_size);
9138 + out:
9139 +       spin_unlock(&debug_lock);
9140 +}
9141 +
9142 +static void dlm_debug_init(void)
9143 +{
9144 +       debug_buf = NULL;
9145 +       debug_size = 0;
9146 +       debug_point = 0;
9147 +       debug_wrap = 0;
9148 +       spin_lock_init(&debug_lock);
9149 +
9150 +       dlm_debug_setup(DLM_DEBUG_SIZE);
9151 +}
9152 +
9153 +#ifdef CONFIG_PROC_FS
9154 +int dlm_debug_info(char *b, char **start, off_t offset, int length)
9155 +{
9156 +       int i, n = 0;
9157 +
9158 +       spin_lock(&debug_lock);
9159 +
9160 +       if (debug_wrap) {
9161 +               for (i = debug_point; i < debug_size; i++)
9162 +                       n += sprintf(b + n, "%c", debug_buf[i]);
9163 +       }
9164 +       for (i = 0; i < debug_point; i++)
9165 +               n += sprintf(b + n, "%c", debug_buf[i]);
9166 +
9167 +       spin_unlock(&debug_lock);
9168 +
9169 +       return n;
9170 +}
9171 +
9172 +int dlm_rcom_info(char *b, char **start, off_t offset, int length)
9173 +{
9174 +       gd_ls_t *ls;
9175 +       gd_csb_t *csb;
9176 +       int n = 0;
9177 +
9178 +       ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9179 +       if (!ls)
9180 +               return 0;
9181 +
9182 +       n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
9183 +                                  "names_recv_count names_recv_msgid "
9184 +                                  "locks_send_count locks_send_msgid "
9185 +                                  "locks_recv_count locks_recv_msgid\n");
9186 +
9187 +       list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
9188 +               n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
9189 +                            csb->csb_node->gn_nodeid,
9190 +                            csb->csb_names_send_count,
9191 +                            csb->csb_names_send_msgid,
9192 +                            csb->csb_names_recv_count,
9193 +                            csb->csb_names_recv_msgid,
9194 +                            csb->csb_locks_send_count,
9195 +                            csb->csb_locks_send_msgid,
9196 +                            csb->csb_locks_recv_count,
9197 +                            csb->csb_locks_recv_msgid);
9198 +       }
9199 +       return n;
9200 +}
9201 +#endif
9202 +
9203 +void dlm_proc_init(void)
9204 +{
9205 +       dlm_debug_init();       /* init debug_lock before any early return below */
9206 +#ifdef CONFIG_PROC_FS
9207 +       debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
9208 +                                            NULL);
9209 +       if (!debug_proc_entry)
9210 +               return;
9211 +
9212 +       debug_proc_entry->get_info = &dlm_debug_info;
9213 +
9214 +       rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
9215 +       if (!rcom_proc_entry)
9216 +               return;
9217 +
9218 +       rcom_proc_entry->get_info = &dlm_rcom_info;
9219 +#endif
9220 +
9221 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9222 +       locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
9223 +                                                 S_IFREG | 0400,
9224 +                                                 NULL, NULL, NULL);
9225 +       if (!locks_proc_entry)
9226 +               return;
9227 +       locks_proc_entry->proc_fops = &locks_fops;
9228 +#endif
9229 +}
9230 +
9231 +void dlm_proc_exit(void)
9232 +{
9233 +#ifdef CONFIG_PROC_FS
9234 +       if (debug_proc_entry) {
9235 +               remove_proc_entry("cluster/dlm_debug", NULL);
9236 +               dlm_debug_setup(0);
9237 +       }
9238 +
9239 +       if (rcom_proc_entry)
9240 +               remove_proc_entry("cluster/dlm_rcom", NULL);
9241 +#endif
9242 +
9243 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9244 +       if (locks_proc_entry)
9245 +               remove_proc_entry("cluster/dlm_locks", NULL);
9246 +#endif
9247 +}
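#
# Note: dlm_debug_log() above appends into a fixed-size buffer that wraps and
# overwrites its oldest bytes, and dlm_debug_dump() prints the older half
# first.  Below is a minimal userspace C sketch of that ring-buffer technique;
# the names and BUF_SIZE are illustrative, not part of the DLM code.
#
#   #include <stdio.h>
#   #include <string.h>
#
#   #define BUF_SIZE 8                    /* stands in for debug_size */
#
#   static char ring[BUF_SIZE];
#   static int point, wrap;
#
#   static void ring_log(const char *msg)
#   {
#           size_t i;
#
#           for (i = 0; i < strlen(msg); i++) {
#                   ring[point++] = msg[i];
#                   if (point == BUF_SIZE) {
#                           point = 0;    /* oldest bytes get overwritten */
#                           wrap = 1;
#                   }
#           }
#   }
#
#   static void ring_dump(void)
#   {
#           int i;
#
#           if (wrap)                     /* older half first, as above */
#                   for (i = point; i < BUF_SIZE; i++)
#                           putchar(ring[i]);
#           for (i = 0; i < point; i++)
#                   putchar(ring[i]);
#           putchar('\n');
#   }
#
#   int main(void)
#   {
#           ring_log("one two three ");   /* 14 chars, so the buffer wraps */
#           ring_dump();                  /* prints "o three ", the newest 8 */
#           return 0;
#   }
#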
9248 diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
9249 --- linux-orig/cluster/dlm/queries.c    1970-01-01 07:30:00.000000000 +0730
9250 +++ linux-patched/cluster/dlm/queries.c 2004-06-25 18:31:07.000000000 +0800
9251 @@ -0,0 +1,697 @@
9252 +/******************************************************************************
9253 +*******************************************************************************
9254 +**
9255 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
9256 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
9257 +**
9258 +**  This copyrighted material is made available to anyone wishing to use,
9259 +**  modify, copy, or redistribute it subject to the terms and conditions
9260 +**  of the GNU General Public License v.2.
9261 +**
9262 +*******************************************************************************
9263 +******************************************************************************/
9264 +
9265 +/*
9266 + * queries.c
9267 + *
9268 + * This file provides the kernel query interface to the DLM.
9269 + *
9270 + */
9271 +
9272 +#define EXPORT_SYMTAB
9273 +#include <linux/module.h>
9274 +
9275 +#include "dlm_internal.h"
9276 +#include "lockqueue.h"
9277 +#include "locking.h"
9278 +#include "lkb.h"
9279 +#include "nodes.h"
9280 +#include "dir.h"
9281 +#include "ast.h"
9282 +#include "memory.h"
9283 +#include "lowcomms.h"
9284 +#include "midcomms.h"
9285 +#include "rsb.h"
9286 +
9287 +static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo);
9288 +static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo);
9289 +
9290 +/*
9291 + * API entry point.
9292 + */
9293 +int dlm_query(void *lockspace,
9294 +             struct dlm_lksb *lksb,
9295 +             int query,
9296 +             struct dlm_queryinfo *qinfo,
9297 +             void (ast_routine(void *)),
9298 +             void *astarg)
9299 +{
9300 +       int status = -EINVAL;
9301 +       gd_lkb_t *target_lkb;
9302 +       gd_lkb_t *query_lkb = NULL;     /* Our temporary LKB */
9303 +       gd_ls_t  *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace);
9304 +
9305 +
9306 +       if (!qinfo)
9307 +               goto out;
9308 +       if (!ls)
9309 +               goto out;
9310 +       if (!ast_routine)
9311 +               goto out;
9312 +       if (!lksb)
9313 +               goto out;
9314 +
9315 +       if (!qinfo->gqi_lockinfo)
9316 +               qinfo->gqi_locksize = 0;
9317 +
9318 +       /* Find the lkid */
9319 +       target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
9320 +       if (!target_lkb)
9321 +               goto out;
9322 +
9323 +       /* If the user wants a list of locks that are blocking or
9324 +          not blocking this lock, then the lock must itself be
9325 +          waiting for something
9326 +       */
9327 +       if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
9328 +            (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
9329 +           target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
9330 +               return -EINVAL;
9331 +
9332 +       /* We now allocate an LKB for our own use (so we can hang
9333 +        * things like the AST routine and the lksb from it) */
9334 +       lksb->sb_status = -EBUSY;
9335 +       query_lkb = create_lkb(ls);
9336 +       if (!query_lkb) {
9337 +               status = -ENOMEM;
9338 +               goto out;
9339 +       }
9340 +       query_lkb->lkb_astaddr  = ast_routine;
9341 +       query_lkb->lkb_astparam = (long)astarg;
9342 +       query_lkb->lkb_resource = target_lkb->lkb_resource;
9343 +       query_lkb->lkb_lksb     = lksb;
9344 +
9345 +       /* Don't free the resource while we are querying it. This ref
9346 +        * will be dropped when the LKB is freed */
9347 +       hold_rsb(query_lkb->lkb_resource);
9348 +
9349 +       /* Fill in the stuff that's always local */
9350 +       if (qinfo->gqi_resinfo) {
9351 +               if (target_lkb->lkb_resource->res_nodeid)
9352 +                       qinfo->gqi_resinfo->rsi_masternode =
9353 +                               target_lkb->lkb_resource->res_nodeid;
9354 +               else
9355 +                       qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
9356 +               qinfo->gqi_resinfo->rsi_length =
9357 +                       target_lkb->lkb_resource->res_length;
9358 +               memcpy(qinfo->gqi_resinfo->rsi_name,
9359 +                      target_lkb->lkb_resource->res_name,
9360 +                      qinfo->gqi_resinfo->rsi_length);
9361 +       }
9362 +
9363 +       /* If the master is local (or the user doesn't want the overhead of a
9364 +        * remote call) - fill in the details here */
9365 +       if (target_lkb->lkb_resource->res_nodeid == 0 ||
9366 +           (query & DLM_QUERY_LOCAL)) {
9367 +
9368 +               status = 0;
9369 +               /* Resource info */
9370 +               if (qinfo->gqi_resinfo) {
9371 +                       query_resource(target_lkb->lkb_resource,
9372 +                                      qinfo->gqi_resinfo);
9373 +               }
9374 +
9375 +               /* Lock lists */
9376 +               if (qinfo->gqi_lockinfo) {
9377 +                       status = query_locks(query, target_lkb, qinfo);
9378 +               }
9379 +
9380 +               query_lkb->lkb_retstatus = status;
9381 +               query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
9382 +               queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
9383 +               wake_astd();
9384 +
9385 +               /* An AST will be delivered so we must return success here */
9386 +               status = 0;
9387 +               goto out;
9388 +       }
9389 +
9390 +       /* Remote master */
9391 +       if (target_lkb->lkb_resource->res_nodeid != 0)
9392 +       {
9393 +               struct gd_remquery *remquery;
9394 +               struct writequeue_entry *e;
9395 +
9396 +               /* Clear this because the receiving end adds to it
9397 +                  with each incoming packet */
9398 +               qinfo->gqi_lockcount = 0;
9399 +
9400 +               /* Squirrel a pointer to the query info struct
9401 +                  somewhere illegal */
9402 +               query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo;
9403 +
9404 +               e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
9405 +                                       sizeof(struct gd_remquery),
9406 +                                       ls->ls_allocation,
9407 +                                       (char **) &remquery);
9408 +               if (!e) {
9409 +                       status = -ENOBUFS;
9410 +                       goto out;
9411 +               }
9412 +
9413 +               /* Build remote packet */
9414 +               memset(remquery, 0, sizeof(struct gd_remquery));
9415 +
9416 +               remquery->rq_query     = query;
9417 +               remquery->rq_mstlkid   = target_lkb->lkb_remid;
9418 +               /* gqi_locksize was forced to zero above when the caller
9419 +                  supplied no gqi_lockinfo buffer */
9420 +               remquery->rq_maxlocks  = qinfo->gqi_locksize;
9421 +
9422 +               remquery->rq_header.rh_cmd       = GDLM_REMCMD_QUERY;
9423 +               remquery->rq_header.rh_flags     = 0;
9424 +               remquery->rq_header.rh_length    = sizeof(struct gd_remquery);
9425 +               remquery->rq_header.rh_lkid      = query_lkb->lkb_id;
9426 +               remquery->rq_header.rh_lockspace = ls->ls_global_id;
9427 +
9428 +               midcomms_send_buffer(&remquery->rq_header, e);
9429 +               status = 0;
9430 +       }
9431 +
9432 +      out:
9433 +
9434 +       return status;
9435 +}
9436 +
9437 +static inline int valid_range(struct dlm_range *r)
9438 +{
9439 +       if (r->ra_start != 0ULL ||
9440 +           r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
9441 +               return 1;
9442 +       else
9443 +               return 0;
9444 +}
9445 +
9446 +static void put_int(int x, char *buf, int *offp)
9447 +{
9448 +       x = cpu_to_le32(x);
9449 +       memcpy(buf + *offp, &x, sizeof(int));
9450 +       *offp += sizeof(int);
9451 +}
9452 +
9453 +static void put_int64(uint64_t x, char *buf, int *offp)
9454 +{
9455 +       x = cpu_to_le64(x);
9456 +       memcpy(buf + *offp, &x, sizeof(uint64_t));
9457 +       *offp += sizeof(uint64_t);
9458 +}
9459 +
9460 +static int get_int(char *buf, int *offp)
9461 +{
9462 +       int value;
9463 +       memcpy(&value, buf + *offp, sizeof(int));
9464 +       *offp += sizeof(int);
9465 +       return le32_to_cpu(value);
9466 +}
9467 +
9468 +static uint64_t get_int64(char *buf, int *offp)
9469 +{
9470 +       uint64_t value;
9471 +
9472 +       memcpy(&value, buf + *offp, sizeof(uint64_t));
9473 +       *offp += sizeof(uint64_t);
9474 +       return le64_to_cpu(value);
9475 +}
9476 +
9477 +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
9478 +
9479 +/* Called from recvd to get lock info for a remote node */
9480 +int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9481 +{
9482 +       struct gd_remquery *query = (struct gd_remquery *) msg;
9483 +       struct gd_remqueryreply *reply;
9484 +       struct dlm_resinfo resinfo;
9485 +       struct dlm_queryinfo qinfo;
9486 +       struct writequeue_entry *e;
9487 +       char *buf;
9488 +       gd_lkb_t *lkb;
9489 +       int status = 0;
9490 +       int bufidx;
9491 +       int finished = 0;
9492 +       int cur_lock = 0;
9493 +       int start_lock = 0;
9494 +
9495 +       lkb = find_lock_by_id(ls, query->rq_mstlkid);
9496 +       if (!lkb) {
9497 +               status = -EINVAL;
9498 +               goto send_error;
9499 +       }
9500 +
9501 +       qinfo.gqi_resinfo = &resinfo;
9502 +       qinfo.gqi_locksize = query->rq_maxlocks;
9503 +
9504 +       /* Get the resource bits */
9505 +       query_resource(lkb->lkb_resource, &resinfo);
9506 +
9507 +       /* Now get the locks if wanted */
9508 +       if (query->rq_maxlocks) {
9509 +               qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
9510 +                                            GFP_KERNEL);
9511 +               if (!qinfo.gqi_lockinfo) {
9512 +                       status = -ENOMEM;
9513 +                       goto send_error;
9514 +               }
9515 +
9516 +               status = query_locks(query->rq_query, lkb, &qinfo);
9517 +               if (status && status != -E2BIG) {
9518 +                       kfree(qinfo.gqi_lockinfo);
9519 +                       goto send_error;
9520 +               }
9521 +       }
9522 +       else {
9523 +               qinfo.gqi_lockinfo = NULL;
9524 +               qinfo.gqi_lockcount = 0;
9525 +       }
9526 +
9527 +       /* Send as many blocks as needed for all the locks */
9528 +       do {
9529 +               int i;
9530 +               int msg_len = sizeof(struct gd_remqueryreply);
9531 +               int last_msg_len = msg_len; /* keeps compiler quiet */
9532 +               int last_lock;
9533 +
9534 +               /* First work out how many locks we can fit into a block */
9535 +               for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
9536 +
9537 +                       last_msg_len = msg_len;
9538 +
9539 +                       msg_len += LOCK_LEN;
9540 +                       if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
9541 +                           valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
9542 +
9543 +                               msg_len += sizeof(uint64_t) * 4;
9544 +                       }
9545 +               }
9546 +
9547 +               /* There must be a neater way of doing this... */
9548 +               if (msg_len > PAGE_SIZE) {
9549 +                       last_lock = i-1;
9550 +                       msg_len = last_msg_len;
9551 +               }
9552 +               else {
9553 +                       last_lock = i;
9554 +               }
9555 +
9556 +               e = lowcomms_get_buffer(nodeid,
9557 +                                       msg_len,
9558 +                                       ls->ls_allocation,
9559 +                                       (char **) &reply);
9560 +               if (!e) {
9561 +                       kfree(qinfo.gqi_lockinfo);
9562 +                       status = -ENOBUFS;
9563 +                       goto out;
9564 +               }
9565 +
9566 +               reply->rq_header.rh_cmd       = GDLM_REMCMD_QUERYREPLY;
9567 +               reply->rq_header.rh_length    = msg_len;
9568 +               reply->rq_header.rh_lkid      = msg->rh_lkid;
9569 +               reply->rq_header.rh_lockspace = msg->rh_lockspace;
9570 +
9571 +               reply->rq_status     = status;
9572 +               reply->rq_startlock  = cur_lock;
9573 +               reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
9574 +               reply->rq_convcount  = qinfo.gqi_resinfo->rsi_convcount;
9575 +               reply->rq_waitcount  = qinfo.gqi_resinfo->rsi_waitcount;
9576 +               memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
9577 +
9578 +               buf = (char *)reply;
9579 +               bufidx = sizeof(struct gd_remqueryreply);
9580 +
9581 +               for (; cur_lock < last_lock; cur_lock++) {
9582 +
9583 +                       buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
9584 +                       buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
9585 +                       buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
9586 +                       put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
9587 +                       put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
9588 +                       put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
9589 +                       put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
9590 +
9591 +                       if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
9592 +                           valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
9593 +
9594 +                               buf[bufidx++] = 1;
9595 +                               put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
9596 +                               put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
9597 +                               put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
9598 +                               put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
9599 +                       }
9600 +                       else {
9601 +                               buf[bufidx++] = 0;
9602 +                       }
9603 +               }
9604 +
9605 +               if (cur_lock == qinfo.gqi_lockcount) {
9606 +                       reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
9607 +                       finished = 1;
9608 +               }
9609 +               else {
9610 +                       reply->rq_header.rh_flags = 0;
9611 +               }
9612 +
9613 +               reply->rq_numlocks = cur_lock - start_lock;
9614 +               start_lock = cur_lock;
9615 +
9616 +               midcomms_send_buffer(&reply->rq_header, e);
9617 +       } while (!finished);
9618 +
9619 +       kfree(qinfo.gqi_lockinfo);
9620 + out:
9621 +       return status;
9622 +
9623 + send_error:
9624 +       e = lowcomms_get_buffer(nodeid,
9625 +                               sizeof(struct gd_remqueryreply),
9626 +                               ls->ls_allocation,
9627 +                               (char **) &reply);
9628 +       if (!e) {
9629 +               status = -ENOBUFS;
9630 +               goto out;
9631 +       }
9632 +       reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9633 +       reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
9634 +       reply->rq_header.rh_length = sizeof(struct gd_remqueryreply);
9635 +       reply->rq_header.rh_lkid = msg->rh_lkid;
9636 +       reply->rq_header.rh_lockspace = msg->rh_lockspace;
9637 +       reply->rq_status     = status;
9638 +       reply->rq_numlocks   = 0;
9639 +       reply->rq_startlock  = 0;
9640 +       reply->rq_grantcount = 0;
9641 +       reply->rq_convcount  = 0;
9642 +       reply->rq_waitcount  = 0;
9643 +
9644 +       midcomms_send_buffer(&reply->rq_header, e);
9645 +
9646 +       return status;
9647 +}
9648 +
9649 +/* Process the reply to a query we sent to a remote master */
9650 +int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9651 +{
9652 +       gd_lkb_t *query_lkb;
9653 +       struct dlm_queryinfo *qinfo;
9654 +       struct gd_remqueryreply *reply;
9655 +       char *buf;
9656 +       int i;
9657 +       int bufidx;
9658 +
9659 +       query_lkb = find_lock_by_id(ls, msg->rh_lkid);
9660 +       if (!query_lkb)
9661 +               return -EINVAL;
9662 +
9663 +       qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
9664 +       reply = (struct gd_remqueryreply *) msg;
9665 +
9666 +       /* Copy the easy bits first */
9667 +       qinfo->gqi_lockcount += reply->rq_numlocks;
9668 +       if (qinfo->gqi_resinfo) {
9669 +               qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
9670 +               qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
9671 +               qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
9672 +               memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
9673 +                       DLM_LVB_LEN);
9674 +       }
9675 +
9676 +       /* Now unpack the locks */
9677 +       bufidx = sizeof(struct gd_remqueryreply);
9678 +       buf = (char *) msg;
9679 +
9680 +       GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
9681 +                   printk("start = %d, num = %d. Max = %d\n",
9682 +                          reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
9683 +
9684 +       for (i = reply->rq_startlock;
9685 +            i < reply->rq_startlock + reply->rq_numlocks; i++) {
9686 +               qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
9687 +               qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
9688 +               qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
9689 +               qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
9690 +               qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
9691 +               qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
9692 +               qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
9693 +               if (buf[bufidx++]) {
9694 +                       qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
9695 +                       qinfo->gqi_lockinfo[i].lki_grrange.ra_end   = get_int64(buf, &bufidx);
9696 +                       qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
9697 +                       qinfo->gqi_lockinfo[i].lki_rqrange.ra_end   = get_int64(buf, &bufidx);
9698 +               }
9699 +               else {
9700 +                       qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
9701 +                       qinfo->gqi_lockinfo[i].lki_grrange.ra_end   = 0xFFFFFFFFFFFFFFFFULL;
9702 +                       qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
9703 +                       qinfo->gqi_lockinfo[i].lki_rqrange.ra_end   = 0xFFFFFFFFFFFFFFFFULL;
9704 +               }
9705 +       }
9706 +
9707 +       /* If this was the last block then now tell the user */
9708 +       if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
9709 +               query_lkb->lkb_retstatus = reply->rq_status;
9710 +               query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
9711 +               queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
9712 +               wake_astd();
9713 +       }
9714 +
9715 +       return 0;
9716 +}
9717 +
9718 +/* Aggregate resource information */
9719 +static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo)
9720 +{
9721 +       struct list_head *tmp;
9722 +
9723 +
9724 +       if (rsb->res_lvbptr)
9725 +               memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
9726 +
9727 +       resinfo->rsi_grantcount = 0;
9728 +       list_for_each(tmp, &rsb->res_grantqueue) {
9729 +               resinfo->rsi_grantcount++;
9730 +       }
9731 +
9732 +       resinfo->rsi_waitcount = 0;
9733 +       list_for_each(tmp, &rsb->res_waitqueue) {
9734 +               resinfo->rsi_waitcount++;
9735 +       }
9736 +
9737 +       resinfo->rsi_convcount = 0;
9738 +       list_for_each(tmp, &rsb->res_convertqueue) {
9739 +               resinfo->rsi_convcount++;
9740 +       }
9741 +
9742 +       return 0;
9743 +}
9744 +
9745 +static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9746 +{
9747 +       int entry;
9748 +
9749 +       /* Don't fill it in if the buffer is full */
9750 +       if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
9751 +               return -E2BIG;
9752 +
9753 +       /* gqi_lockcount contains the number of locks we have returned */
9754 +       entry = qinfo->gqi_lockcount++;
9755 +
9756 +       /* Fun with master copies */
9757 +       if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
9758 +               qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
9759 +               qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
9760 +       }
9761 +       else {
9762 +               qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
9763 +               qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
9764 +       }
9765 +
9766 +       /* Also make sure we always have a valid nodeid in there, the
9767 +          calling end may not know which node "0" is */
9768 +       if (lkb->lkb_nodeid)
9769 +           qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
9770 +       else
9771 +           qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
9772 +
9773 +       if (lkb->lkb_parent)
9774 +               qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
9775 +       else
9776 +               qinfo->gqi_lockinfo[entry].lki_parent = 0;
9777 +
9778 +       qinfo->gqi_lockinfo[entry].lki_state  = lkb->lkb_status;
9779 +       qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
9780 +       qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
9781 +
9782 +       if (lkb->lkb_range) {
9783 +               qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
9784 +                       lkb->lkb_range[GR_RANGE_START];
9785 +               qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
9786 +                       lkb->lkb_range[GR_RANGE_END];
9787 +               qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
9788 +                       lkb->lkb_range[RQ_RANGE_START];
9789 +               qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
9790 +                       lkb->lkb_range[RQ_RANGE_END];
9791 +       } else {
9792 +               qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
9793 +               qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9794 +               qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
9795 +               qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9796 +       }
9797 +       return 0;
9798 +}
9799 +
9800 +static int query_lkb_queue(struct list_head *queue, int query,
9801 +                          struct dlm_queryinfo *qinfo)
9802 +{
9803 +       struct list_head *tmp;
9804 +       int status = 0;
9805 +       int mode = query & DLM_QUERY_MODE_MASK;
9806 +
9807 +       list_for_each(tmp, queue) {
9808 +               gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9809 +               int lkmode;
9810 +
9811 +               if (query & DLM_QUERY_RQMODE)
9812 +                       lkmode = lkb->lkb_rqmode;
9813 +               else
9814 +                       lkmode = lkb->lkb_grmode;
9815 +
9816 +               /* Add the LKB info to the list if it matches the criteria in
9817 +                * the query bitmap */
9818 +               switch (query & DLM_QUERY_MASK) {
9819 +               case DLM_QUERY_LOCKS_ALL:
9820 +                       status = add_lock(lkb, qinfo);
9821 +                       break;
9822 +
9823 +               case DLM_QUERY_LOCKS_HIGHER:
9824 +                       if (lkmode > mode)
9825 +                               status = add_lock(lkb, qinfo);
9826 +                       break;
9827 +
9828 +               case DLM_QUERY_LOCKS_EQUAL:
9829 +                       if (lkmode == mode)
9830 +                               status = add_lock(lkb, qinfo);
9831 +                       break;
9832 +
9833 +               case DLM_QUERY_LOCKS_LOWER:
9834 +                       if (lkmode < mode)
9835 +                               status = add_lock(lkb, qinfo);
9836 +                       break;
9837 +               }
9838 +       }
9839 +       return status;
9840 +}
9841 +
9842 +/*
9843 + * Return 1 if the locks' ranges overlap
9844 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
9845 + */
9846 +static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
9847 +{
9848 +       if (!lkb1->lkb_range || !lkb2->lkb_range)
9849 +               return 1;
9850 +
9851 +       if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
9852 +           lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
9853 +               return 0;
9854 +
9855 +       return 1;
9856 +}
9857 +
9858 +extern const int __dlm_compat_matrix[8][8];
9859 +
9860 +static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9861 +{
9862 +       struct list_head *tmp;
9863 +       int status = 0;
9864 +
9865 +       list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9866 +               gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9867 +
9868 +               if (ranges_overlap(lkb, qlkb) &&
9869 +                   !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
9870 +                       status = add_lock(lkb, qinfo);
9871 +       }
9872 +
9873 +       return status;
9874 +}
9875 +
9876 +static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9877 +{
9878 +       struct list_head *tmp;
9879 +       int status = 0;
9880 +
9881 +       list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9882 +               gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9883 +
9884 +               if (!(ranges_overlap(lkb, qlkb) &&
9885 +                     !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
9886 +                       status = add_lock(lkb, qinfo);
9887 +       }
9888 +
9889 +       return status;
9890 +}
9891 +
9892 +/* Gather a list of appropriate locks */
9893 +static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9894 +{
9895 +       int status = 0;
9896 +
9897 +
9898 +       /* Mask in the actual granted/requested mode of the lock if LOCK_THIS
9899 +        * was requested as the mode
9900 +        */
9901 +       if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
9902 +               query &= ~DLM_QUERY_MODE_MASK;
9903 +               if (query & DLM_QUERY_RQMODE)
9904 +                       query |= lkb->lkb_rqmode;
9905 +               else
9906 +                       query |= lkb->lkb_grmode;
9907 +       }
9908 +
9909 +       qinfo->gqi_lockcount = 0;
9910 +
9911 +       /* BLOCKING/NOTBLOCK only look at the granted queue */
9912 +       if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
9913 +               return get_blocking_locks(lkb, qinfo);
9914 +
9915 +       if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
9916 +               return get_nonblocking_locks(lkb, qinfo);
9917 +
9918 +       /* Do the lock queues that were requested */
9919 +       if (query & DLM_QUERY_QUEUE_GRANT) {
9920 +               status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
9921 +                                        query, qinfo);
9922 +       }
9923 +
9924 +       if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
9925 +               status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
9926 +                                        query, qinfo);
9927 +       }
9928 +
9929 +       if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
9930 +               status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
9931 +                                        query, qinfo);
9932 +       }
9933 +
9934 +
9935 +       return status;
9936 +}
9937 +
9938 +EXPORT_SYMBOL(dlm_query);
9939 +/*
9940 + * Overrides for Emacs so that we follow Linus's tabbing style.
9941 + * Emacs will notice this stuff at the end of the file and automatically
9942 + * adjust the settings for this buffer only.  This must remain at the end
9943 + * of the file.
9944 + * ---------------------------------------------------------------------------
9945 + * Local variables:
9946 + * c-file-style: "linux"
9947 + * End:
9948 + */
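#
# The remote query protocol above marshals fixed-width fields little-endian at
# a running offset via put_int()/get_int() and friends.  Below is a
# self-contained userspace C sketch of that round trip; htole32()/le32toh()
# (glibc <endian.h>) stand in for the kernel's cpu_to_le32()/le32_to_cpu(),
# and the field names are illustrative.
#
#   #include <endian.h>
#   #include <stdint.h>
#   #include <stdio.h>
#   #include <string.h>
#
#   static void put_u32(uint32_t x, char *buf, int *offp)
#   {
#           x = htole32(x);
#           memcpy(buf + *offp, &x, sizeof(x));
#           *offp += sizeof(x);
#   }
#
#   static uint32_t get_u32(char *buf, int *offp)
#   {
#           uint32_t x;
#
#           memcpy(&x, buf + *offp, sizeof(x));
#           *offp += sizeof(x);
#           return le32toh(x);
#   }
#
#   int main(void)
#   {
#           char buf[64];
#           int off = 0;
#           uint32_t lkid, mstlkid, node;
#
#           /* sender packs fields in a fixed order... */
#           put_u32(0x10001, buf, &off);
#           put_u32(0x20002, buf, &off);
#           put_u32(3, buf, &off);
#
#           /* ...and the receiver unpacks them in the same order */
#           off = 0;
#           lkid    = get_u32(buf, &off);
#           mstlkid = get_u32(buf, &off);
#           node    = get_u32(buf, &off);
#           printf("lkid=%#x mstlkid=%#x node=%u\n", lkid, mstlkid, node);
#           return 0;
#   }
#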
9949 diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
9950 --- linux-orig/cluster/dlm/queries.h    1970-01-01 07:30:00.000000000 +0730
9951 +++ linux-patched/cluster/dlm/queries.h 2004-06-25 18:31:07.000000000 +0800
9952 @@ -0,0 +1,20 @@
9953 +/******************************************************************************
9954 +*******************************************************************************
9955 +**
9956 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
9957 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
9958 +**
9959 +**  This copyrighted material is made available to anyone wishing to use,
9960 +**  modify, copy, or redistribute it subject to the terms and conditions
9961 +**  of the GNU General Public License v.2.
9962 +**
9963 +*******************************************************************************
9964 +******************************************************************************/
9965 +
9966 +#ifndef __QUERIES_DOT_H__
9967 +#define __QUERIES_DOT_H__
9968 +
9969 +extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9970 +extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9971 +
9972 +#endif                          /* __QUERIES_DOT_H__ */
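#
# For reference, get_blocking_locks() in queries.c treats a granted lock as
# blocking a request iff the two byte ranges overlap and the mode pair is
# incompatible.  The userspace C sketch below reproduces that test with the
# classic six-mode DLM compatibility table (NL/CR/CW/PR/PW/EX); the in-kernel
# __dlm_compat_matrix is 8x8 with extra invalid/padding rows, hence its
# [mode + 1] indexing above.  Names here are illustrative.
#
#   #include <stdint.h>
#   #include <stdio.h>
#
#   enum { NL, CR, CW, PR, PW, EX };
#
#   static const int compat[6][6] = {
#           /*        NL CR CW PR PW EX */
#           /* NL */ { 1, 1, 1, 1, 1, 1 },
#           /* CR */ { 1, 1, 1, 1, 1, 0 },
#           /* CW */ { 1, 1, 1, 0, 0, 0 },
#           /* PR */ { 1, 1, 0, 1, 0, 0 },
#           /* PW */ { 1, 1, 0, 0, 0, 0 },
#           /* EX */ { 1, 0, 0, 0, 0, 0 },
#   };
#
#   struct range { uint64_t start, end; };
#
#   /* mirrors the <=/>= comparison in ranges_overlap() */
#   static int overlap(struct range rq, struct range gr)
#   {
#           return !(rq.end <= gr.start || rq.start >= gr.end);
#   }
#
#   static int blocks(int grmode, struct range gr, int rqmode, struct range rq)
#   {
#           return overlap(rq, gr) && !compat[grmode][rqmode];
#   }
#
#   int main(void)
#   {
#           struct range gr = { 0, 100 }, rq = { 50, 150 };
#
#           printf("PR holder vs PW request, overlapping: %s\n",
#                  blocks(PR, gr, PW, rq) ? "blocks" : "compatible");
#
#           rq.start = 100; rq.end = 200; /* disjoint ranges never block */
#           printf("PR holder vs PW request, disjoint:    %s\n",
#                  blocks(PR, gr, PW, rq) ? "blocks" : "compatible");
#           return 0;
#   }
#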
9973 diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
9974 --- linux-orig/cluster/dlm/rebuild.c    1970-01-01 07:30:00.000000000 +0730
9975 +++ linux-patched/cluster/dlm/rebuild.c 2004-06-25 18:31:07.000000000 +0800
9976 @@ -0,0 +1,1246 @@
9977 +/******************************************************************************
9978 +*******************************************************************************
9979 +**
9980 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
9981 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
9982 +**  
9983 +**  This copyrighted material is made available to anyone wishing to use,
9984 +**  modify, copy, or redistribute it subject to the terms and conditions
9985 +**  of the GNU General Public License v.2.
9986 +**
9987 +*******************************************************************************
9988 +******************************************************************************/
9989 +
9990 +/* 
9991 + * Rebuild RSB's on new masters.  Functions for transferring locks and
9992 + * subresources to new RSB masters during recovery.
9993 + */
9994 +
9995 +#include "dlm_internal.h"
9996 +#include "reccomms.h"
9997 +#include "lkb.h"
9998 +#include "rsb.h"
9999 +#include "nodes.h"
10000 +#include "config.h"
10001 +#include "memory.h"
10002 +#include "recover.h"
10003 +
10004 +
10005 +/* Types of entity serialised in remastering messages */
10006 +#define REMASTER_ROOTRSB 1
10007 +#define REMASTER_RSB     2
10008 +#define REMASTER_LKB     3
10009 +
10010 +struct rcom_fill {
10011 +       char *                  outbuf;         /* Beginning of data */
10012 +       int                     offset;         /* Current offset into outbuf */
10013 +       int                     maxlen;         /* Max value of offset */
10014 +       int                     remasterid;
10015 +       int                     count;
10016 +       gd_res_t *              rsb;
10017 +       gd_res_t *              subrsb;
10018 +       gd_lkb_t *              lkb;
10019 +       struct list_head *      lkbqueue;
10020 +       char                    more;
10021 +};
10022 +typedef struct rcom_fill rcom_fill_t;
10023 +
10024 +
10025 +struct rebuild_node {
10026 +       struct list_head        list;
10027 +       int                     nodeid;
10028 +       gd_res_t *              rootrsb;
10029 +};
10030 +typedef struct rebuild_node rebuild_node_t;
10031 +
10032 +
10033 +/* 
10034 + * The root rsb is passed in; all of its lkb's (its own and its subrsbs')
10035 + * will be sent to the new master.  The rsb is "done" with recovery when the
10036 + * new master has replied with new remote lockid's for all of this rsb's lkb's.
10037 + */
10038 +
10039 +void expect_new_lkids(gd_res_t *rsb)
10040 +{
10041 +       rsb->res_newlkid_expect = 0;
10042 +       recover_list_add(rsb);
10043 +}
10044 +
10045 +/* 
10046 + * This function is called on a root rsb or subrsb when another lkb is being
10047 + * sent to the new master, for which we expect a corresponding remote lkid
10048 + */
10049 +
10050 +void need_new_lkid(gd_res_t *rsb)
10051 +{
10052 +       gd_res_t *root = rsb;
10053 +
10054 +       if (rsb->res_parent)
10055 +               root = rsb->res_root;
10056 +
10057 +       if (!root->res_newlkid_expect)
10058 +               recover_list_add(root);
10059 +       else
10060 +               GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10061 +
10062 +       root->res_newlkid_expect++;
10063 +}
10064 +
10065 +/* 
10066 + * This function is called for each lkb for which a new remote lkid is
10067 + * received.  Decrement the expected number of remote lkids expected for the
10068 + * root rsb.
10069 + */
10070 +
10071 +void have_new_lkid(gd_lkb_t *lkb)
10072 +{
10073 +       gd_res_t *root = lkb->lkb_resource;
10074 +
10075 +       if (root->res_parent)
10076 +               root = root->res_root;
10077 +
10078 +       down_write(&root->res_lock);
10079 +
10080 +       GDLM_ASSERT(root->res_newlkid_expect,
10081 +                   printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10082 +
10083 +       root->res_newlkid_expect--;
10084 +
10085 +       if (!root->res_newlkid_expect) {
10086 +               clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10087 +               recover_list_del(root);
10088 +       }
10089 +       up_write(&root->res_lock);
10090 +}
10091 +
10092 +/* 
10093 + * Return the rebuild struct for a node - will create an entry on the rootrsb
10094 + * list if necessary.
10095 + *
10096 + * Currently no locking is needed here as it all happens in the gdlm_recvd
10097 + * thread
10098 + */
10099 +
10100 +static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid)
10101 +{
10102 +       rebuild_node_t *node = NULL;
10103 +
10104 +       list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10105 +               if (node->nodeid == nodeid)
10106 +                       return node;
10107 +       }
10108 +
10109 +       /* Not found, add one */
10110 +       node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10111 +       if (!node)
10112 +               return NULL;
10113 +
10114 +       node->nodeid = nodeid;
10115 +       node->rootrsb = NULL;
10116 +       list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10117 +
10118 +       return node;
10119 +}
10120 +
10121 +/* 
10122 + * Tidy up after a rebuild run.  Called when all recovery has finished
10123 + */
10124 +
10125 +void rebuild_freemem(gd_ls_t *ls)
10126 +{
10127 +       rebuild_node_t *node = NULL, *s;
10128 +
10129 +       list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10130 +               list_del(&node->list);
10131 +               kfree(node);
10132 +       }
10133 +}
10134 +
10135 +static void put_int(int x, char *buf, int *offp)
10136 +{
10137 +       x = cpu_to_le32(x);
10138 +       memcpy(buf + *offp, &x, sizeof(int));
10139 +       *offp += sizeof(int);
10140 +}
10141 +
10142 +static void put_int64(uint64_t x, char *buf, int *offp)
10143 +{
10144 +       x = cpu_to_le64(x);
10145 +       memcpy(buf + *offp, &x, sizeof(uint64_t));
10146 +       *offp += sizeof(uint64_t);
10147 +}
10148 +
10149 +static void put_bytes(char *x, int len, char *buf, int *offp)
10150 +{
10151 +       put_int(len, buf, offp);
10152 +       memcpy(buf + *offp, x, len);
10153 +       *offp += len;
10154 +}
10155 +
10156 +static void put_char(char x, char *buf, int *offp)
10157 +{
10158 +       buf[*offp] = x;
10159 +       *offp += 1;
10160 +}
10161 +
10162 +static int get_int(char *buf, int *offp)
10163 +{
10164 +       int value;
10165 +       memcpy(&value, buf + *offp, sizeof(int));
10166 +       *offp += sizeof(int);
10167 +       return le32_to_cpu(value);
10168 +}
10169 +
10170 +static uint64_t get_int64(char *buf, int *offp)
10171 +{
10172 +       uint64_t value;
10173 +
10174 +       memcpy(&value, buf + *offp, sizeof(uint64_t));
10175 +       *offp += sizeof(uint64_t);
10176 +       return le64_to_cpu(value);
10177 +}
10178 +
10179 +static char get_char(char *buf, int *offp)
10180 +{
10181 +       char x = buf[*offp];
10182 +
10183 +       *offp += 1;
10184 +       return x;
10185 +}
10186 +
10187 +static void get_bytes(char *bytes, int *len, char *buf, int *offp)
10188 +{
10189 +       *len = get_int(buf, offp);
10190 +       memcpy(bytes, buf + *offp, *len);
10191 +       *offp += *len;
10192 +}
10193 +
10194 +static int lkb_length(gd_lkb_t *lkb)
10195 +{
10196 +       int len = 0;
10197 +
10198 +       len += sizeof(int);     /* lkb_id */
10199 +       len += sizeof(int);     /* lkb_resource->res_remasterid */
10200 +       len += sizeof(int);     /* lkb_flags */
10201 +       len += sizeof(int);     /* lkb_status */
10202 +       len += sizeof(char);    /* lkb_rqmode */
10203 +       len += sizeof(char);    /* lkb_grmode */
10204 +       len += sizeof(int);     /* lkb_childcnt */
10205 +       len += sizeof(int);     /* lkb_parent->lkb_id */
10206 +       len += sizeof(int);     /* lkb_bastaddr */
10207 +
10208 +       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10209 +               len += sizeof(int);     /* number of lvb bytes */
10210 +               len += DLM_LVB_LEN;
10211 +       }
10212 +
10213 +       if (lkb->lkb_range) {
10214 +               len += sizeof(uint64_t);
10215 +               len += sizeof(uint64_t);
10216 +               if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
10217 +                       len += sizeof(uint64_t);
10218 +                       len += sizeof(uint64_t);
10219 +               }
10220 +       }
10221 +
10222 +       return len;
10223 +}
10224 +
10225 +/* 
10226 + * It's up to the caller to be sure there's enough space in the buffer.
10227 + */
10228 +
10229 +static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp)
10230 +{
10231 +       int flags;
10232 +
10233 +       /* Need to tell the remote end if we have a range */
10234 +       flags = lkb->lkb_flags;
10235 +       if (lkb->lkb_range)
10236 +               flags |= GDLM_LKFLG_RANGE;
10237 +
10238 +       /* 
10239 +        * See lkb_length()
10240 +        * Total: 30 (no lvb) or 66 (with lvb) bytes
10241 +        */
10242 +
10243 +       put_int(lkb->lkb_id, buf, offp);
10244 +       put_int(lkb->lkb_resource->res_remasterid, buf, offp);
10245 +       put_int(flags, buf, offp);
10246 +       put_int(lkb->lkb_status, buf, offp);
10247 +       put_char(lkb->lkb_rqmode, buf, offp);
10248 +       put_char(lkb->lkb_grmode, buf, offp);
10249 +       put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
10250 +
10251 +       if (lkb->lkb_parent)
10252 +               put_int(lkb->lkb_parent->lkb_id, buf, offp);
10253 +       else
10254 +               put_int(0, buf, offp);
10255 +
10256 +       if (lkb->lkb_bastaddr)
10257 +               put_int(1, buf, offp);
10258 +       else
10259 +               put_int(0, buf, offp);
10260 +
10261 +       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10262 +               GDLM_ASSERT(lkb->lkb_lvbptr,);
10263 +               put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
10264 +       }
10265 +
10266 +       /* Only send the range we actually need */
10267 +       if (lkb->lkb_range) {
10268 +               switch (lkb->lkb_status) {
10269 +               case GDLM_LKSTS_CONVERT:
10270 +                       put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10271 +                       put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10272 +                       put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10273 +                       put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10274 +                       break;
10275 +               case GDLM_LKSTS_WAITING:
10276 +                       put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10277 +                       put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10278 +                       break;
10279 +               case GDLM_LKSTS_GRANTED:
10280 +                       put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10281 +                       put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10282 +                       break;
10283 +               default:
10284 +                       GDLM_ASSERT(0,);
10285 +               }
10286 +       }
10287 +}
10288 +
10289 +static int rsb_length(gd_res_t *rsb)
10290 +{
10291 +       int len = 0;
10292 +
10293 +       len += sizeof(int);     /* number of res_name bytes */
10294 +       len += rsb->res_length; /* res_name */
10295 +       len += sizeof(int);     /* res_remasterid */
10296 +       len += sizeof(int);     /* res_parent->res_remasterid */
10297 +
10298 +       return len;
10299 +}
10300 +
10301 +static inline gd_res_t *next_subrsb(gd_res_t *subrsb)
10302 +{
10303 +       struct list_head *tmp;
10304 +       gd_res_t *r;
10305 +
10306 +       tmp = subrsb->res_subreslist.next;
10307 +       r = list_entry(tmp, gd_res_t, res_subreslist);
10308 +
10309 +       return r;
10310 +}
10311 +
10312 +static inline int last_in_list(gd_res_t *r, struct list_head *head)
10313 +{
10314 +       gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist);
10315 +
10316 +       if (last == r)
10317 +               return 1;
10318 +       return 0;
10319 +}
10320 +
10321 +/* 
10322 + * Used to decide if an rsb should be rebuilt on a new master.  An rsb only
10323 + * needs to be rebuilt if we have lkb's queued on it.  NOREBUILD lkb's on the
10324 + * wait queue are not rebuilt.
10325 + */
10326 +
10327 +static int lkbs_to_remaster(gd_res_t *r)
10328 +{
10329 +       gd_lkb_t *lkb;
10330 +       gd_res_t *sub;
10331 +
10332 +       if (!list_empty(&r->res_grantqueue) ||
10333 +           !list_empty(&r->res_convertqueue))
10334 +               return TRUE;
10335 +
10336 +       list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
10337 +               if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10338 +                       continue;
10339 +               return TRUE;
10340 +       }
10341 +
10342 +       list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
10343 +               if (!list_empty(&sub->res_grantqueue) ||
10344 +                   !list_empty(&sub->res_convertqueue))
10345 +                       return TRUE;
10346 +
10347 +               list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
10348 +                       if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10349 +                               continue;
10350 +                       return TRUE;
10351 +               }
10352 +       }
10353 +
10354 +       return FALSE;
10355 +}
10356 +
10357 +static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp)
10358 +{
10359 +       /* 
10360 +        * See rsb_length()
10361 +        * Total: 36 bytes (4 + 24 + 4 + 4)
10362 +        */
10363 +
10364 +       put_bytes(rsb->res_name, rsb->res_length, buf, offp);
10365 +       put_int(rsb->res_remasterid, buf, offp);
10366 +
10367 +       if (rsb->res_parent)
10368 +               put_int(rsb->res_parent->res_remasterid, buf, offp);
10369 +       else
10370 +               put_int(0, buf, offp);
10371 +
10372 +       GDLM_ASSERT(!rsb->res_lvbptr,);
10373 +}
10374 +
10375 +/* 
10376 + * Flatten an LKB into a buffer for sending to the new RSB master.  As a
10377 + * side-effect the nodeid of the lock is set to the nodeid of the new RSB
10378 + * master.
10379 + */
10380 +
10381 +static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill)
10382 +{
10383 +       if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
10384 +               goto nospace;
10385 +
10386 +       lkb->lkb_nodeid = r->res_nodeid;
10387 +
10388 +       put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
10389 +       serialise_lkb(lkb, fill->outbuf, &fill->offset);
10390 +
10391 +       fill->count++;
10392 +       need_new_lkid(r);
10393 +       return 0;
10394 +
10395 +      nospace:
10396 +       return -ENOSPC;
10397 +}
10398 +
10399 +/* 
10400 + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
10401 + */
10402 +
10403 +static int pack_lkb_queue(gd_res_t *r, struct list_head *queue,
10404 +                         rcom_fill_t *fill)
10405 +{
10406 +       gd_lkb_t *lkb;
10407 +       int error;
10408 +
10409 +       list_for_each_entry(lkb, queue, lkb_statequeue) {
10410 +               if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10411 +                       continue;
10412 +
10413 +               error = pack_one_lkb(r, lkb, fill);
10414 +               if (error)
10415 +                       goto nospace;
10416 +       }
10417 +
10418 +       return 0;
10419 +
10420 +      nospace:
10421 +       fill->lkb = lkb;
10422 +       fill->lkbqueue = queue;
10423 +
10424 +       return error;
10425 +}
10426 +
10427 +static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill)
10428 +{
10429 +       int error;
10430 +
10431 +       error = pack_lkb_queue(r, &r->res_grantqueue, fill);
10432 +       if (error)
10433 +               goto nospace;
10434 +
10435 +       error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10436 +       if (error)
10437 +               goto nospace;
10438 +
10439 +       error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10440 +
10441 +      nospace:
10442 +       return error;
10443 +}
10444 +
10445 +/* 
10446 + * Pack remaining lkb's for rsb or subrsb.  This may include a partial lkb
10447 + * queue and full lkb queues.
10448 + */
10449 +
10450 +static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill)
10451 +{
10452 +       struct list_head *tmp, *start, *end;
10453 +       gd_lkb_t *lkb;
10454 +       int error;
10455 +
10456 +       /* 
10457 +        * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
10458 +        */
10459 +
10460 +       error = pack_one_lkb(r, fill->lkb, fill);
10461 +       if (error)
10462 +               goto out;
10463 +
10464 +       start = fill->lkb->lkb_statequeue.next;
10465 +       end = fill->lkbqueue;
10466 +
10467 +       for (tmp = start; tmp != end; tmp = tmp->next) {
10468 +               lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
10469 +
10470 +               error = pack_one_lkb(r, lkb, fill);
10471 +               if (error) {
10472 +                       fill->lkb = lkb;
10473 +                       goto out;
10474 +               }
10475 +       }
10476 +
10477 +       /* 
10478 +        * Pack all lkb's on r's queues following fill->lkbqueue.
10479 +        */
10480 +
10481 +       if (fill->lkbqueue == &r->res_waitqueue)
10482 +               goto out;
10483 +       if (fill->lkbqueue == &r->res_convertqueue)
10484 +               goto skip;
10485 +
10486 +       GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
10487 +
10488 +       error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10489 +       if (error)
10490 +               goto out;
10491 +      skip:
10492 +       error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10493 +
10494 +      out:
10495 +       return error;
10496 +}
10497 +
10498 +static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill)
10499 +{
10500 +       int error;
10501 +
10502 +       down_write(&subrsb->res_lock);
10503 +
10504 +       if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
10505 +               goto nospace;
10506 +
10507 +       subrsb->res_nodeid = rsb->res_nodeid;
10508 +       subrsb->res_remasterid = ++fill->remasterid;
10509 +
10510 +       put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
10511 +       serialise_rsb(subrsb, fill->outbuf, &fill->offset);
10512 +
10513 +       error = pack_lkb_queues(subrsb, fill);
10514 +       if (error)
10515 +               goto nospace;
10516 +
10517 +       up_write(&subrsb->res_lock);
10518 +
10519 +       return 0;
10520 +
10521 +      nospace:
10522 +       up_write(&subrsb->res_lock);
10523 +       fill->subrsb = subrsb;
10524 +
10525 +       return -ENOSPC;
10526 +}
10527 +
10528 +static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill)
10529 +{
10530 +       gd_res_t *subrsb;
10531 +       int error = 0;
10532 +
10533 +       /* 
10534 +        * When an initial subrsb is given, we know it needs to be packed.
10535 +        * When no initial subrsb is given, begin with the first (if any exist).
10536 +        */
10537 +
10538 +       if (!in_subrsb) {
10539 +               if (list_empty(&rsb->res_subreslist))
10540 +                       goto out;
10541 +
10542 +               subrsb = list_entry(rsb->res_subreslist.next, gd_res_t,
10543 +                                   res_subreslist);
10544 +       } else
10545 +               subrsb = in_subrsb;
10546 +
10547 +       for (;;) {
10548 +               error = pack_one_subrsb(rsb, subrsb, fill);
10549 +               if (error)
10550 +                       goto out;
10551 +
10552 +               if (last_in_list(subrsb, &rsb->res_subreslist))
10553 +                       break;
10554 +
10555 +               subrsb = next_subrsb(subrsb);
10556 +       }
10557 +
10558 +      out:
10559 +       return error;
10560 +}
10561 +
10562 +/* 
10563 + * Finish packing whatever is left in an rsb tree.  If space runs out while
10564 + * finishing, save subrsb/lkb and this will be called again for the same rsb.
10565 + *
10566 + * !subrsb &&  lkb, we left off part way through root rsb's lkbs.
10567 + *  subrsb && !lkb, we left off just before starting a new subrsb.
10568 + *  subrsb &&  lkb, we left off part way through a subrsb's lkbs.
10569 + * !subrsb && !lkb, we shouldn't be in this function, but starting
10570 + *                  a new rsb in pack_rsb_tree().
10571 + */
10572 +
10573 +static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb,
10574 +                                  rcom_fill_t *fill)
10575 +{
10576 +       gd_res_t *subrsb = NULL;
10577 +       int error = 0;
10578 +
10579 +       if (!fill->subrsb && fill->lkb) {
10580 +               error = pack_lkb_remaining(rsb, fill);
10581 +               if (error)
10582 +                       goto out;
10583 +
10584 +               error = pack_subrsbs(rsb, NULL, fill);
10585 +               if (error)
10586 +                       goto out;
10587 +       }
10588 +
10589 +       else if (fill->subrsb && !fill->lkb) {
10590 +               error = pack_subrsbs(rsb, fill->subrsb, fill);
10591 +               if (error)
10592 +                       goto out;
10593 +       }
10594 +
10595 +       else if (fill->subrsb && fill->lkb) {
10596 +               error = pack_lkb_remaining(fill->subrsb, fill);
10597 +               if (error)
10598 +                       goto out;
10599 +
10600 +               if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
10601 +                       goto out;
10602 +
10603 +               subrsb = next_subrsb(fill->subrsb);
10604 +
10605 +               error = pack_subrsbs(rsb, subrsb, fill);
10606 +               if (error)
10607 +                       goto out;
10608 +       }
10609 +
10610 +       fill->subrsb = NULL;
10611 +       fill->lkb = NULL;
10612 +
10613 +      out:
10614 +       return error;
10615 +}
10616 +
10617 +/* 
10618 + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
10619 + * buffer.  When the buffer runs out of space, save the place to restart (the
10620 + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
10621 + */
10622 +
10623 +static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill)
10624 +{
10625 +       int error = -ENOSPC;
10626 +
10627 +       fill->remasterid = 0;
10628 +
10629 +       /* 
10630 +        * Pack the root rsb itself.  A 1 byte type precedes the serialised
10631 +        * rsb.  Then pack the lkb's for the root rsb.
10632 +        */
10633 +
10634 +       down_write(&rsb->res_lock);
10635 +
10636 +       if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
10637 +               goto out;
10638 +
10639 +       rsb->res_remasterid = ++fill->remasterid;
10640 +       put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
10641 +       serialise_rsb(rsb, fill->outbuf, &fill->offset);
10642 +
10643 +       error = pack_lkb_queues(rsb, fill);
10644 +       if (error)
10645 +               goto out;
10646 +
10647 +       up_write(&rsb->res_lock);
10648 +
10649 +       /* 
10650 +        * Pack subrsb/lkb's under the root rsb.
10651 +        */
10652 +
10653 +       error = pack_subrsbs(rsb, NULL, fill);
10654 +
10655 +       return error;
10656 +
10657 +      out:
10658 +       up_write(&rsb->res_lock);
10659 +       return error;
10660 +}
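+
+/* 
+ * For reference, a sketch of the resulting buffer layout, as consumed by
+ * rebuild_rsbs_recv():
+ *
+ *   [REMASTER_ROOTRSB][serialised root rsb]
+ *   [REMASTER_LKB][serialised lkb] ...     (locks on the root rsb's queues)
+ *   [REMASTER_RSB][serialised subrsb]
+ *   [REMASTER_LKB][serialised lkb] ...     (locks on that subrsb's queues)
+ *   ...
+ *
+ * Each record is preceded by a one-byte type; the field layout of the
+ * serialised rsb/lkb records is defined by serialise_rsb() and the lkb
+ * packing routines.
+ */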
10661 +
10662 +/* 
10663 + * Given an RSB, return the next RSB that should be sent to a new master.
10664 + */
10665 +
10666 +static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb)
10667 +{
10668 +       struct list_head *tmp, *start, *end;
10669 +       gd_res_t *r;
10670 +
10671 +       if (!rsb)
10672 +               start = ls->ls_rootres.next;
10673 +       else
10674 +               start = rsb->res_rootlist.next;
10675 +
10676 +       end = &ls->ls_rootres;
10677 +
10678 +       for (tmp = start; tmp != end; tmp = tmp->next) {
10679 +               r = list_entry(tmp, gd_res_t, res_rootlist);
10680 +
10681 +               if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
10682 +                       if (r->res_nodeid && lkbs_to_remaster(r)) {
10683 +                               expect_new_lkids(r);
10684 +                               return r;
10685 +                       } else
10686 +                               clear_bit(RESFL_NEW_MASTER, &r->res_flags);
10687 +               }
10688 +       }
10689 +
10690 +       return NULL;
10691 +}
10692 +
10693 +/* 
10694 + * Given an rcom buffer, fill it with RSB's that need to be sent to a single
10695 + * new master node.  In the case where all the data to send to one node
10696 + * requires multiple messages, this function needs to resume filling each
10697 + * successive buffer from the point where it left off when the previous buffer
10698 + * filled up.
10699 + */
10700 +
10701 +static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid)
10702 +{
10703 +       gd_res_t *rsb, *prev_rsb = fill->rsb;
10704 +       int error;
10705 +
10706 +       fill->offset = 0;
10707 +
10708 +       if (!prev_rsb) {
10709 +
10710 +               /* 
10711 +                * The first time this function is called.
10712 +                */
10713 +
10714 +               rsb = next_remastered_rsb(ls, NULL);
10715 +               if (!rsb)
10716 +                       goto no_more;
10717 +
10718 +       } else if (fill->subrsb || fill->lkb) {
10719 +
10720 +               /* 
10721 +                * Continue packing an rsb tree that was partially packed last
10722 +                * time (fill->subrsb/lkb indicates where packing of last block
10723 +                * left off)
10724 +                */
10725 +
10726 +               rsb = prev_rsb;
10727 +               *nodeid = rsb->res_nodeid;
10728 +
10729 +               error = pack_rsb_tree_remaining(ls, rsb, fill);
10730 +               if (error == -ENOSPC)
10731 +                       goto more;
10732 +
10733 +               rsb = next_remastered_rsb(ls, prev_rsb);
10734 +               if (!rsb)
10735 +                       goto no_more;
10736 +
10737 +               if (rsb->res_nodeid != prev_rsb->res_nodeid)
10738 +                       goto more;
10739 +       } else {
10740 +               rsb = prev_rsb;
10741 +       }
10742 +
10743 +       /* 
10744 +        * Pack rsb trees into the buffer until we run out of space, run out of
10745 +        * new rsb's or hit a new nodeid.
10746 +        */
10747 +
10748 +       *nodeid = rsb->res_nodeid;
10749 +
10750 +       for (;;) {
10751 +               error = pack_rsb_tree(ls, rsb, fill);
10752 +               if (error == -ENOSPC)
10753 +                       goto more;
10754 +
10755 +               prev_rsb = rsb;
10756 +
10757 +               rsb = next_remastered_rsb(ls, prev_rsb);
10758 +               if (!rsb)
10759 +                       goto no_more;
10760 +
10761 +               if (rsb->res_nodeid != prev_rsb->res_nodeid)
10762 +                       goto more;
10763 +       }
10764 +
10765 +      more:
10766 +       fill->more = 1;
10767 +       fill->rsb = rsb;
10768 +       return;
10769 +
10770 +      no_more:
10771 +       fill->more = 0;
10772 +}
10773 +
10774 +/* 
10775 + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
10776 + */
10777 +
10778 +int rebuild_rsbs_send(gd_ls_t *ls)
10779 +{
10780 +       gd_rcom_t *rc;
10781 +       rcom_fill_t fill;
10782 +       uint32_t nodeid;
10783 +       int error;
10784 +
10785 +       GDLM_ASSERT(recover_list_empty(ls),);
10786 +
10787 +       log_all(ls, "rebuild locks");
10788 +
10789 +       error = -ENOMEM;
10790 +       rc = allocate_rcom_buffer(ls);
10791 +       if (!rc)
10792 +               goto ret;
10793 +
10794 +       error = 0;
10795 +       memset(&fill, 0, sizeof(rcom_fill_t));
10796 +       fill.outbuf = rc->rc_buf;
10797 +       fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
10798 +
10799 +       do {
10800 +               fill_rcom_buffer(ls, &fill, &nodeid);
10801 +               if (!fill.offset)
10802 +                       break;
10803 +
10804 +               rc->rc_datalen = fill.offset;
10805 +               error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
10806 +               if (error)
10807 +                       goto out;
10808 +
10809 +               schedule();
10810 +               error = gdlm_recovery_stopped(ls);
10811 +               if (error)
10812 +                       goto out;
10813 +       }
10814 +       while (fill.more);
10815 +
10816 +       error = gdlm_wait_function(ls, &recover_list_empty);
10817 +
10818 +       log_all(ls, "rebuilt %d locks", fill.count);
10819 +
10820 +      out:
10821 +       rebuild_freemem(ls);
10822 +       free_rcom_buffer(rc);
10823 +
10824 +      ret:
10825 +       return error;
10826 +}
10827 +
10828 +static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid,
10829 +                                   gd_res_t *rootrsb)
10830 +{
10831 +       gd_res_t *rsb;
10832 +
10833 +       GDLM_ASSERT(rootrsb,);
10834 +
10835 +       if (rootrsb->res_remasterid == remasterid) {
10836 +               rsb = rootrsb;
10837 +               goto out;
10838 +       }
10839 +
10840 +       list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10841 +               if (rsb->res_remasterid == remasterid)
10842 +                       goto out;
10843 +       }
10844 +       rsb = NULL;
10845 +
10846 +      out:
10847 +       return rsb;
10848 +}
10849 +
10850 +/* 
10851 + * Search a queue for the given remote lock id (remlkid).
10852 + */
10853 +
10854 +static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid,
10855 +                               int remid)
10856 +{
10857 +       gd_lkb_t *lkb;
10858 +
10859 +       list_for_each_entry(lkb, statequeue, lkb_statequeue) {
10860 +               if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
10861 +                       return lkb;
10862 +               }
10863 +       }
10864 +
10865 +       return NULL;
10866 +}
10867 +
10868 +/* 
10869 + * Given a remote lock ID (and a parent resource), return the local LKB for it.
10870 + * Hopefully we don't need to do this too often on deep lock trees; it is
10871 + * very suboptimal for anything but the smallest lock trees, since it searches
10872 + * the whole lock tree for an LKB with the remote id "remid" and the node
10873 + * "nodeid" and returns the LKB address.  OPTIMISATION: keep a list of these
10874 + * while building up the remastered LKBs (a sketch follows the function below).
10875 + */
10876 +
10877 +static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid)
10878 +{
10879 +       gd_lkb_t *lkb;
10880 +       gd_res_t *rsb;
10881 +
10882 +       lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
10883 +       if (lkb)
10884 +               goto out;
10885 +
10886 +       lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
10887 +       if (lkb)
10888 +               goto out;
10889 +
10890 +       lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
10891 +       if (lkb)
10892 +               goto out;
10893 +
10894 +       list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10895 +               lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
10896 +               if (lkb)
10897 +                       goto out;
10898 +
10899 +               lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
10900 +               if (lkb)
10901 +                       goto out;
10902 +
10903 +               lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
10904 +               if (lkb)
10905 +                       goto out;
10906 +       }
10907 +       lkb = NULL;
10908 +
10909 +      out:
10910 +       return lkb;
10911 +}
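+
+/* 
+ * A minimal sketch (not part of this patch) of the OPTIMISATION suggested
+ * above: remember each remastered lkb keyed by (nodeid, remid) as it is
+ * created, so later lookups avoid walking every queue in the tree.  The
+ * names remlkid_entry, remlkid_cache_add and remlkid_cache_lookup are
+ * hypothetical; a real version would live in the lockspace struct and be
+ * freed when recovery completes.
+ */
+
+struct remlkid_entry {
+       struct list_head le_list;
+       int le_nodeid;
+       int le_remid;
+       gd_lkb_t *le_lkb;
+};
+
+static LIST_HEAD(remlkid_cache);
+
+static void remlkid_cache_add(gd_lkb_t *lkb, int nodeid, int remid)
+{
+       struct remlkid_entry *e = kmalloc(sizeof(*e), GFP_KERNEL);
+
+       if (!e)
+               return;         /* harmless; fall back to the slow search */
+
+       e->le_nodeid = nodeid;
+       e->le_remid = remid;
+       e->le_lkb = lkb;
+       list_add(&e->le_list, &remlkid_cache);
+}
+
+static gd_lkb_t *remlkid_cache_lookup(int nodeid, int remid)
+{
+       struct remlkid_entry *e;
+
+       list_for_each_entry(e, &remlkid_cache, le_list) {
+               if (e->le_nodeid == nodeid && e->le_remid == remid)
+                       return e->le_lkb;
+       }
+       return NULL;
+}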
10912 +
10913 +/* 
10914 + * Unpack an LKB from a remaster operation
10915 + */
10916 +
10917 +static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb,
10918 +                          char *buf, int *ptr, char *outbuf, int *outoffp)
10919 +{
10920 +       gd_lkb_t *lkb;
10921 +       gd_res_t *rsb;
10922 +       int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
10923 +
10924 +       remote_lkid = get_int(buf, ptr);
10925 +
10926 +       rsb_rmid = get_int(buf, ptr);
10927 +       rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
10928 +       GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
10929 +
10930 +       /* 
10931 +        * We could have received this lkb already from a previous recovery
10932 +        * that was interrupted.  If so, just return the lkid to the remote
10933 +        * node.
10934 +        */
10935 +       lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
10936 +       if (lkb)
10937 +               goto put_lkid;
10938 +
10939 +       lkb = create_lkb(rsb->res_ls);
10940 +       if (!lkb)
10941 +               goto out;
10942 +
10943 +       lkb->lkb_remid = remote_lkid;
10944 +       lkb->lkb_flags = get_int(buf, ptr);
10945 +       status = get_int(buf, ptr);
10946 +       lkb->lkb_rqmode = get_char(buf, ptr);
10947 +       lkb->lkb_grmode = get_char(buf, ptr);
10948 +       atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
10949 +
10950 +       parentid = get_int(buf, ptr);
10951 +       lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
10952 +
10953 +       if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10954 +               lkb->lkb_lvbptr = allocate_lvb(ls);
10955 +               if (!lkb->lkb_lvbptr)
10956 +                       goto out;
10957 +               get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
10958 +       }
10959 +
10960 +       if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
10961 +               uint64_t start, end;
10962 +
10963 +               /* Don't need to keep the range flag, for comms use only */
10964 +               lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
10965 +               start = get_int64(buf, ptr);
10966 +               end = get_int64(buf, ptr);
10967 +
10968 +               lkb->lkb_range = allocate_range(rsb->res_ls);
10969 +               if (!lkb->lkb_range)
10970 +                       goto out;
10971 +
10972 +               switch (status) {
10973 +               case GDLM_LKSTS_CONVERT:
10974 +                       lkb->lkb_range[RQ_RANGE_START] = start;
10975 +                       lkb->lkb_range[RQ_RANGE_END] = end;
10976 +                       start = get_int64(buf, ptr);
10977 +                       end = get_int64(buf, ptr);
10978 +                       lkb->lkb_range[GR_RANGE_START] = start;
10979 +                       lkb->lkb_range[GR_RANGE_END] = end;
+                       /* a break is needed here: falling through would
+                        * overwrite the requested range with the granted
+                        * values just read */
+                       break;
10980 +
10981 +               case GDLM_LKSTS_WAITING:
10982 +                       lkb->lkb_range[RQ_RANGE_START] = start;
10983 +                       lkb->lkb_range[RQ_RANGE_END] = end;
10984 +                       break;
10985 +
10986 +               case GDLM_LKSTS_GRANTED:
10987 +                       lkb->lkb_range[GR_RANGE_START] = start;
10988 +                       lkb->lkb_range[GR_RANGE_END] = end;
10989 +                       break;
10990 +               default:
10991 +                       GDLM_ASSERT(0,);
10992 +               }
10993 +       }
10994 +
10995 +       /* Resolve local lock LKB address from parent ID */
10996 +       if (parentid)
10997 +               lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
10998 +                                                 parentid);
10999 +
11000 +       atomic_inc(&rsb->res_ref);
11001 +       lkb->lkb_resource = rsb;
11002 +
11003 +       lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11004 +       lkb->lkb_nodeid = rem_nodeid;
11005 +
11006 +       /* 
11007 +        * Put the lkb on an RSB queue.  An lkb that's in the midst of a
11008 +        * conversion request (on the requesting node's lockqueue and has
11009 +        * LQCONVERT set) should be put on the granted queue.  The convert
11010 +        * request will be resent by the requesting node.
11011 +        */
11012 +
11013 +       if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11014 +               lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
11015 +               GDLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11016 +                           printk("status=%d\n", status););
11017 +               lkb->lkb_rqmode = DLM_LOCK_IV;
11018 +               status = GDLM_LKSTS_GRANTED;
11019 +       }
11020 +
11021 +       lkb_enqueue(rsb, lkb, status);
11022 +
11023 +       /* 
11024 +        * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11025 +        */
11026 +
11027 +       if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11028 +           && lkb->lkb_grmode > DLM_LOCK_NL) {
11029 +               if (!rsb->res_lvbptr)
11030 +                       rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
11031 +               if (!rsb->res_lvbptr)
11032 +                       goto out;
11033 +               memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11034 +       }
11035 +
11036 +       /* 
11037 +        * Clear flags that may have been sent over that are only relevant in
11038 +        * the context of the sender.
11039 +        */
11040 +
11041 +       lkb->lkb_flags &= ~(GDLM_LKFLG_DELAST | GDLM_LKFLG_DELETED |
11042 +                           GDLM_LKFLG_LQRESEND | GDLM_LKFLG_NOREBUILD |
11043 +                           GDLM_LKFLG_DEMOTED);
11044 +
11045 +      put_lkid:
11046 +       /* Return the new LKID to the caller's buffer */
11047 +       put_int(lkb->lkb_id, outbuf, outoffp);
11048 +       put_int(lkb->lkb_remid, outbuf, outoffp);
11049 +       error = 0;
11050 +
11051 +      out:
11052 +       return error;
11053 +}
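+
+/* 
+ * For reference, the field order deserialise_lkb() expects (and the sender
+ * must pack): remote lkid (int), rsb remasterid (int), flags (int), status
+ * (int), rqmode (char), grmode (char), childcnt (int), parent id (int),
+ * bastaddr (int), then an LVB byte block if GDLM_LKFLG_VALBLK is set, and
+ * one pair (two pairs for CONVERT status) of 64-bit range values if
+ * GDLM_LKFLG_RANGE is set.
+ */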
11054 +
11055 +static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb,
11056 +                                char *buf, int *ptr)
11057 +{
11058 +       int length;
11059 +       int remasterid;
11060 +       int parent_remasterid;
11061 +       char name[DLM_RESNAME_MAXLEN];
11062 +       int error;
11063 +       gd_res_t *parent = NULL;
11064 +       gd_res_t *rsb;
11065 +
11066 +       get_bytes(name, &length, buf, ptr);
11067 +       remasterid = get_int(buf, ptr);
11068 +       parent_remasterid = get_int(buf, ptr);
11069 +
11070 +       if (parent_remasterid)
11071 +               parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11072 +
11073 +       /* 
11074 +        * The rsb reference from this find_or_create_rsb() will keep the rsb
11075 +        * around while we add new lkb's to it from deserialise_lkb.  Each of
11076 +        * the lkb's will add an rsb reference.  The reference added here is
11077 +        * removed by release_rsb() after all lkb's are added.
11078 +        */
11079 +
11080 +       error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
11081 +       GDLM_ASSERT(!error,);
11082 +
11083 +       /* find_or_create_rsb() may have just created this rsb, in which
+        * case no master is assigned yet (-1) and we become the master. */
11084 +       if (rsb->res_nodeid == -1)
11085 +               rsb->res_nodeid = our_nodeid();
11086 +
11087 +       rsb->res_remasterid = remasterid;
11088 +
11089 +       return rsb;
11090 +}
11091 +
11092 +/* 
11093 + * Processing at the receiving end of a NEWLOCKS message from a node in
11094 + * rebuild_rsbs_send().  Rebuild a remastered lock tree.  Nodeid is the remote
11095 + * node whose locks we are now mastering.  For a reply we need to send back the
11096 + * new lockids of the remastered locks so that remote ops can find them.
11097 + */
11098 +
11099 +int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11100 +{
11101 +       gd_rcom_t *rc;
11102 +       gd_res_t *rsb = NULL;
11103 +       rebuild_node_t *rnode;
11104 +       char *outbuf;
11105 +       int outptr, ptr = 0, error = -ENOMEM;
11106 +
11107 +       rnode = find_rebuild_root(ls, nodeid);
11108 +       if (!rnode)
11109 +               goto out;
11110 +
11111 +       /* 
11112 +        * Allocate a buffer for the reply message, which is a list of remote
11113 +        * lock IDs and their (new) local lock ids.  It will always be big
11114 +        * enough to fit <n> 8-byte ID pairs if it already fit <n> serialised
+        * LKBs, since each serialised LKB is larger than 8 bytes.
11115 +        */
11116 +
11117 +       rc = allocate_rcom_buffer(ls);
11118 +       if (!rc)
11119 +               goto out;
11120 +       outbuf = rc->rc_buf;
11121 +       outptr = 0;
11122 +
11123 +       /* 
11124 +        * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
11125 +        * created.  Each deserialise_rsb adds an rsb reference that must be
11126 +        * removed with release_rsb once all new lkb's for an rsb have been
11127 +        * added.
11128 +        */
11129 +
11130 +       while (ptr < len) {
11131 +               int type;
11132 +
11133 +               type = get_char(buf, &ptr);
11134 +
11135 +               switch (type) {
11136 +               case REMASTER_ROOTRSB:
11137 +                       if (rsb)
11138 +                               release_rsb(rsb);
11139 +                       rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11140 +                                             &ptr);
11141 +                       rnode->rootrsb = rsb;
11142 +                       break;
11143 +
11144 +               case REMASTER_RSB:
11145 +                       if (rsb)
11146 +                               release_rsb(rsb);
11147 +                       rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11148 +                                             &ptr);
11149 +                       break;
11150 +
11151 +               case REMASTER_LKB:
11152 +                       deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
11153 +                                       outbuf, &outptr);
11154 +                       break;
11155 +
11156 +               default:
11157 +                       GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
11158 +                                             "len=%d\n", type, nodeid, ptr,
11159 +                                             len););
11160 +               }
11161 +       }
11162 +
11163 +       if (rsb)
11164 +               release_rsb(rsb);
11165 +
11166 +       /* 
11167 +        * Reply with the new lock IDs.
11168 +        */
11169 +
11170 +       rc->rc_datalen = outptr;
11171 +       error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
11172 +
11173 +       free_rcom_buffer(rc);
11174 +
11175 +      out:
11176 +       return error;
11177 +}
11178 +
11179 +/* 
11180 + * Processing for a NEWLOCKIDS message.  Called when we get the reply from the
11181 + * new master telling us what the new remote lock IDs are for the remastered
11182 + * locks.
11183 + */
11184 +
11185 +int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11186 +{
11187 +       int offset = 0;
11188 +
11189 +       if (len == 1)
11190 +               len = 0;
11191 +
11192 +       while (offset < len) {
11193 +               int remote_id;
11194 +               int local_id;
11195 +               gd_lkb_t *lkb;
11196 +
11197 +               if (offset + 8 > len) {
11198 +                       log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
11199 +                                 "length nodeid=%d offset=%d len=%d",
11200 +                                 nodeid, offset, len);
11201 +                       break;
11202 +               }
11203 +
11204 +               remote_id = get_int(buf, &offset);
11205 +               local_id = get_int(buf, &offset);
11206 +
11207 +               lkb = find_lock_by_id(ls, local_id);
11208 +               if (lkb) {
11209 +                       lkb->lkb_remid = remote_id;
11210 +                       have_new_lkid(lkb);
11211 +               } else {
11212 +                       log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
11213 +                                 "nodeid=%d id=%x remid=%x offset=%d len=%d",
11214 +                                 nodeid, local_id, remote_id, offset, len);
11215 +               }
11216 +       }
11217 +
11218 +       if (recover_list_empty(ls))
11219 +               wake_up(&ls->ls_wait_general);
11220 +
11221 +       return 0;
11222 +}
11223 diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
11224 --- linux-orig/cluster/dlm/rebuild.h    1970-01-01 07:30:00.000000000 +0730
11225 +++ linux-patched/cluster/dlm/rebuild.h 2004-06-25 18:31:07.000000000 +0800
11226 @@ -0,0 +1,22 @@
11227 +/******************************************************************************
11228 +*******************************************************************************
11229 +**
11230 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
11231 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
11232 +**  
11233 +**  This copyrighted material is made available to anyone wishing to use,
11234 +**  modify, copy, or redistribute it subject to the terms and conditions
11235 +**  of the GNU General Public License v.2.
11236 +**
11237 +*******************************************************************************
11238 +******************************************************************************/
11239 +
11240 +#ifndef __REBUILD_DOT_H__
11241 +#define __REBUILD_DOT_H__
11242 +
11243 +int rebuild_rsbs_send(gd_ls_t * ls);
11244 +int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
11245 +int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
11246 +int rebuild_freemem(gd_ls_t * ls);
11247 +
11248 +#endif                         /* __REBUILD_DOT_H__ */
11249 diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
11250 --- linux-orig/cluster/dlm/reccomms.c   1970-01-01 07:30:00.000000000 +0730
11251 +++ linux-patched/cluster/dlm/reccomms.c        2004-06-25 18:31:07.000000000 +0800
11252 @@ -0,0 +1,502 @@
11253 +/******************************************************************************
11254 +*******************************************************************************
11255 +**
11256 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
11257 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
11258 +**  
11259 +**  This copyrighted material is made available to anyone wishing to use,
11260 +**  modify, copy, or redistribute it subject to the terms and conditions
11261 +**  of the GNU General Public License v.2.
11262 +**
11263 +*******************************************************************************
11264 +******************************************************************************/
11265 +
11266 +#include "dlm_internal.h"
11267 +#include "lowcomms.h"
11268 +#include "midcomms.h"
11269 +#include "reccomms.h"
11270 +#include "nodes.h"
11271 +#include "lockspace.h"
11272 +#include "recover.h"
11273 +#include "dir.h"
11274 +#include "config.h"
11275 +#include "rebuild.h"
11276 +#include "memory.h"
11277 +
11278 +/* This code assumes that only a single recovery communication will be
11279 + * in progress at a time per lockspace */
11280 +
11281 +static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc);
11282 +
11283 +/*
11284 + * Track per-node progress/stats during recovery to help debugging.
11285 + */
11286 +
11287 +void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send)
11288 +{
11289 +       gd_csb_t *csb;
11290 +       int found = 0;
11291 +
11292 +       list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11293 +               if (csb->csb_node->gn_nodeid == nodeid) {
11294 +                       found = TRUE;
11295 +                       break;
11296 +               }
11297 +       }
11298 +
11299 +       if (!found)
11300 +               return;
11301 +
11302 +       if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
11303 +               if (send) {
11304 +                       csb->csb_names_send_count++;
11305 +                       csb->csb_names_send_msgid = rc->rc_msgid;
11306 +               } else {
11307 +                       csb->csb_names_recv_count++;
11308 +                       csb->csb_names_recv_msgid = rc->rc_msgid;
11309 +               }
11310 +       } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
11311 +               if (send) {
11312 +                       csb->csb_locks_send_count++;
11313 +                       csb->csb_locks_send_msgid = rc->rc_msgid;
11314 +               } else {
11315 +                       csb->csb_locks_recv_count++;
11316 +                       csb->csb_locks_recv_msgid = rc->rc_msgid;
11317 +               }
11318 +       }
11319 +}
11320 +
11321 +void rcom_log_clear(gd_ls_t *ls)
11322 +{
11323 +       gd_csb_t *csb;
11324 +
11325 +       list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11326 +               csb->csb_names_send_count = 0;
11327 +               csb->csb_names_send_msgid = 0;
11328 +               csb->csb_names_recv_count = 0;
11329 +               csb->csb_names_recv_msgid = 0;
11330 +               csb->csb_locks_send_count = 0;
11331 +               csb->csb_locks_send_msgid = 0;
11332 +               csb->csb_locks_recv_count = 0;
11333 +               csb->csb_locks_recv_msgid = 0;
11334 +       }
11335 +}
11336 +
11337 +static int rcom_response(gd_ls_t *ls)
11338 +{
11339 +       return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11340 +}
11341 +
11342 +/**
11343 + * rcom_send_message - send or request recovery data
11344 + * @ls: the lockspace
11345 + * @nodeid: node to which the message is sent
11346 + * @type: type of recovery message
11347 + * @rc: the rc buffer to send
11348 + * @need_reply: wait for reply if this is set
11349 + *
11350 + * Using this interface
11351 + * i)   Allocate an rc buffer:  
11352 + *          rc = allocate_rcom_buffer(ls);
11353 + * ii)  Copy data to send beginning at rc->rc_buf:
11354 + *          memcpy(rc->rc_buf, mybuf, mylen);
11355 + * iii) Set rc->rc_datalen to the number of bytes copied in (ii): 
11356 + *          rc->rc_datalen = mylen;
11357 + * iv)  Submit the rc to this function:
11358 + *          rcom_send_message(ls, nodeid, type, rc, need_reply);
11359 + *
11360 + * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t).  If
11361 + * more data must be passed in one send, use rcom_expand_buffer() which
11362 + * incrementally increases the size of the rc buffer by dlm_config.buffer_size
11363 + * bytes.
11364 + *
11365 + * Any data returned for the message (when need_reply is set) will be saved in
11366 + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
11367 + * number of bytes copied into rc->rc_buf.
11368 + *
11369 + * Returns: 0 on success, -EXXX on failure
11370 + */
11371 +
11372 +int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc,
11373 +                     int need_reply)
11374 +{
11375 +       int error = 0;
11376 +
11377 +       if (!rc->rc_datalen)
11378 +               rc->rc_datalen = 1;
11379 +
11380 +       /* 
11381 +        * Fill in the header.
11382 +        */
11383 +
11384 +       rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
11385 +       rc->rc_header.rh_lockspace = ls->ls_global_id;
11386 +       rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1;
11387 +       rc->rc_subcmd = type;
11388 +       rc->rc_msgid = ++ls->ls_rcom_msgid;
11389 +
11390 +       rcom_log(ls, nodeid, rc, 1);
11391 +
11392 +       /* 
11393 +        * When a reply is received, the reply data goes back into this buffer.
11394 +        * Synchronous rcom requests (need_reply=1) are serialised because of
11395 +        * the single ls_rcom.
11396 +        */
11397 +
11398 +       if (need_reply) {
11399 +               down(&ls->ls_rcom_lock);
11400 +               ls->ls_rcom = rc;
11401 +       }
11402 +
11403 +       /* 
11404 +        * After sending the message we'll wait at the end of this function to
11405 +        * get a reply.  The READY flag will be set when the reply has been
11406 +        * received and requested data has been copied into
11407 +        * ls->ls_rcom->rc_buf.
11408 +        */
11409 +
11410 +       GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
11411 +
11412 +       /* 
11413 +        * The WAIT bit indicates that we're waiting for and willing to accept a
11414 +        * reply.  Any replies are ignored unless this bit is set.
11415 +        */
11416 +
11417 +       set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11418 +
11419 +       /* 
11420 +        * Process the message locally.
11421 +        */
11422 +
11423 +       if (nodeid == our_nodeid()) {
11424 +               rcom_process_message(ls, nodeid, rc);
11425 +               goto out;
11426 +       }
11427 +
11428 +       /* 
11429 +        * Send the message.
11430 +        */
11431 +
11432 +       log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
11433 +
11434 +       error = midcomms_send_message(nodeid, (struct gd_req_header *) rc,
11435 +                                     GFP_KERNEL);
11436 +       GDLM_ASSERT(error >= 0, printk("error = %d\n", error););
11437 +       error = 0;
11438 +
11439 +       /* 
11440 +        * Wait for a reply.  Once a reply is processed from midcomms, the
11441 +        * READY bit will be set and we'll be awoken (gdlm_wait_function will
11442 +        * return 0).
11443 +        */
11444 +
11445 +       if (need_reply) {
11446 +               error = gdlm_wait_function(ls, &rcom_response);
11447 +               if (error)
11448 +                       log_debug(ls, "rcom wait error %d", error);
11449 +       }
11450 +
11451 +      out:
11452 +       clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11453 +       clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11454 +
11455 +       if (need_reply)
11456 +               up(&ls->ls_rcom_lock);
11457 +
11458 +       return error;
11459 +}
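+
+/* 
+ * A minimal usage sketch of the recipe documented above (see
+ * gdlm_wait_status_all() in recover.c for a real caller); "mybuf",
+ * "mylen" and process_reply() are placeholders:
+ *
+ *     gd_rcom_t *rc = allocate_rcom_buffer(ls);
+ *     if (!rc)
+ *             return -ENOMEM;
+ *     memcpy(rc->rc_buf, mybuf, mylen);
+ *     rc->rc_datalen = mylen;
+ *     error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
+ *     if (!error)
+ *             process_reply(rc->rc_buf, rc->rc_datalen);
+ *     free_rcom_buffer(rc);
+ */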
11460 +
11461 +/* 
11462 + * Runs in same context as midcomms.
11463 + */
11464 +
11465 +static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc)
11466 +{
11467 +       gd_rcom_t rc_stack;
11468 +       gd_rcom_t *reply = NULL;
11469 +       gd_resdata_t *rd;
11470 +       int status, datalen, maxlen;
11471 +       uint32_t be_nodeid;
11472 +
11473 +       if (!ls)
11474 +               return;
11475 +
11476 +       rcom_log(ls, nodeid, rc, 0);
11477 +
11478 +       if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
11479 +               log_error(ls, "ignoring recovery message %x from %u",
11480 +                         rc->rc_subcmd, nodeid);
11481 +               return;
11482 +       }
11483 +
11484 +       switch (rc->rc_subcmd) {
11485 +
11486 +       case RECCOMM_STATUS:
11487 +
11488 +               memset(&rc_stack, 0, sizeof(gd_rcom_t));
11489 +               reply = &rc_stack;
11490 +
11491 +               reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11492 +               reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11493 +               reply->rc_subcmd = rc->rc_subcmd;
11494 +               reply->rc_msgid = rc->rc_msgid;
11495 +               reply->rc_buf[0] = 0;
11496 +
11497 +               if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
11498 +                       reply->rc_buf[0] |= RESDIR_VALID;
11499 +
11500 +               if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
11501 +                       reply->rc_buf[0] |= RESDIR_ALL_VALID;
11502 +
11503 +               if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
11504 +                       reply->rc_buf[0] |= NODES_VALID;
11505 +
11506 +               if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
11507 +                       reply->rc_buf[0] |= NODES_ALL_VALID;
11508 +
11509 +               reply->rc_datalen = 1;
11510 +               reply->rc_header.rh_length =
11511 +                       sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11512 +
11513 +               log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
11514 +               break;
11515 +
11516 +       case RECCOMM_RECOVERNAMES:
11517 +
11518 +               reply = allocate_rcom_buffer(ls);
11519 +               GDLM_ASSERT(reply,);
11520 +               maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
11521 +
11522 +               reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11523 +               reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11524 +               reply->rc_subcmd = rc->rc_subcmd;
11525 +               reply->rc_msgid = rc->rc_msgid;
11526 +
11527 +               /* 
11528 +                * The other node wants a bunch of resource names.  The name of
11529 +                * the resource to begin with is in rc->rc_buf.
11530 +                */
11531 +
11532 +               datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
11533 +                                             reply->rc_buf, maxlen, nodeid);
11534 +
11535 +               reply->rc_datalen = datalen;
11536 +               reply->rc_header.rh_length =
11537 +                   sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11538 +
11539 +               log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
11540 +                         reply->rc_msgid);
11541 +               break;
11542 +
11543 +       case RECCOMM_GETMASTER:
11544 +
11545 +               reply = allocate_rcom_buffer(ls);
11546 +               GDLM_ASSERT(reply,);
11547 +
11548 +               reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11549 +               reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11550 +               reply->rc_subcmd = rc->rc_subcmd;
11551 +               reply->rc_msgid = rc->rc_msgid;
11552 +
11553 +               /* 
11554 +                * The other node wants to know the master of a named resource.
11555 +                */
11556 +
11557 +               status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen,
11558 +                                    &rd, 1);
11559 +               if (status != 0) {
11560 +                       free_rcom_buffer(reply);
11561 +                       reply = NULL;
11562 +                       return;
11563 +               }
11564 +               be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
11565 +               memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
11566 +               reply->rc_datalen = sizeof(uint32_t);
11567 +               reply->rc_header.rh_length =
11568 +                   sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11569 +               break;
11570 +
11571 +       case RECCOMM_BULKLOOKUP:
11572 +
11573 +               reply = allocate_rcom_buffer(ls);
11574 +               GDLM_ASSERT(reply,);
11575 +
11576 +               reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11577 +               reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11578 +               reply->rc_subcmd = rc->rc_subcmd;
11579 +               reply->rc_msgid = rc->rc_msgid;
11580 +
11581 +               /* 
11582 +                * This is a bulk version of the above and just returns a
11583 +                * buffer full of node ids to match the resources
11584 +                */
11585 +
11586 +               datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
11587 +                                            rc->rc_datalen, reply->rc_buf);
11588 +               if (datalen < 0) {
11589 +                       free_rcom_buffer(reply);
11590 +                       reply = NULL;
11591 +                       return;
11592 +               }
11593 +
11594 +               reply->rc_datalen = datalen;
11595 +               reply->rc_header.rh_length =
11596 +                   sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11597 +               break;
11598 +
11599 +               /* 
11600 +                * These RECCOMM messages don't need replies.
11601 +                */
11602 +
11603 +       case RECCOMM_NEWLOCKS:
11604 +               rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11605 +               break;
11606 +
11607 +       case RECCOMM_NEWLOCKIDS:
11608 +               rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11609 +               break;
11610 +
11611 +       case RECCOMM_REMRESDATA:
11612 +               remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1);
11613 +               break;
11614 +
11615 +       default:
11616 +               GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
11617 +       }
11618 +
11619 +       if (reply) {
11620 +               if (nodeid == our_nodeid()) {
11621 +                       GDLM_ASSERT(rc == ls->ls_rcom,);
11622 +                       memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11623 +                       rc->rc_datalen = reply->rc_datalen;
11624 +               } else {
11625 +                       midcomms_send_message(nodeid,
11626 +                                             (struct gd_req_header *) reply,
11627 +                                             GFP_KERNEL);
11628 +               }
11629 +
11630 +               if (reply != &rc_stack)
11631 +                       free_rcom_buffer(reply);
11632 +       }
11633 +}
11634 +
11635 +static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11636 +{
11637 +       gd_rcom_t *rc = ls->ls_rcom;
11638 +
11639 +       if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
11640 +               log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
11641 +               return;
11642 +       }
11643 +
11644 +       if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
11645 +               log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
11646 +                         reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
11647 +               return;
11648 +       }
11649 +
11650 +       memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11651 +       rc->rc_datalen = reply->rc_datalen;
11652 +
11653 +       /* 
11654 +        * Tell the thread waiting in rcom_send_message() that it can go ahead.
11655 +        */
11656 +
11657 +       set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11658 +       wake_up(&ls->ls_wait_general);
11659 +}
11660 +
11661 +static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11662 +{
11663 +       restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
11664 +                              reply->rc_msgid);
11665 +}
11666 +
11667 +/* 
11668 + * Runs in same context as midcomms.
11669 + */
11670 +
11671 +static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11672 +{
11673 +       if (gdlm_recovery_stopped(ls)) {
11674 +               log_error(ls, "ignoring recovery reply %x from %u",
11675 +                         reply->rc_subcmd, nodeid);
11676 +               return;
11677 +       }
11678 +
11679 +       switch (reply->rc_subcmd) {
11680 +       case RECCOMM_GETMASTER:
11681 +               process_reply_async(ls, nodeid, reply);
11682 +               break;
11683 +       case RECCOMM_STATUS:
11684 +       case RECCOMM_NEWLOCKS:
11685 +       case RECCOMM_NEWLOCKIDS:
11686 +       case RECCOMM_RECOVERNAMES:
11687 +               process_reply_sync(ls, nodeid, reply);
11688 +               break;
11689 +       default:
11690 +               log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
11691 +                         reply->rc_subcmd, nodeid);
11692 +       }
11693 +}
11694 +
11695 +
11696 +static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header)
11697 +{
11698 +       struct writequeue_entry *wq;
11699 +       gd_rcom_t *rc = (gd_rcom_t *) header;
11700 +       gd_rcom_t *reply;
11701 +
11702 +       wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL,
11703 +                                (char **)&reply);
11704 +       if (!wq)
11705 +               return -ENOMEM;
11706 +
11707 +       reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11708 +       reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11709 +       reply->rc_subcmd = rc->rc_subcmd;
11710 +       reply->rc_msgid = rc->rc_msgid;
11711 +       reply->rc_buf[0] = 0;
11712 +
11713 +       reply->rc_datalen = 1;
11714 +       reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11715 +
11716 +       midcomms_send_buffer((struct gd_req_header *)reply, wq);
11717 +       return 0;
11718 +}
11719 +
11720 +
11721 +/* 
11722 + * Runs in same context as midcomms.  Both recovery requests and recovery
11723 + * replies come through this function.
11724 + */
11725 +
11726 +void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header)
11727 +{
11728 +       gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace);
11729 +       gd_rcom_t *rc = (gd_rcom_t *) header;
11730 +
11731 +       /* If the lockspace doesn't exist then still send a status message
11732 +          back; it's possible that it just doesn't have its global_id
11733 +          yet. */
11734 +       if (!ls) {
11735 +             send_ls_not_ready(nodeid, header);
11736 +             return;
11737 +       }
11738 +
11739 +       switch (header->rh_cmd) {
11740 +       case GDLM_REMCMD_RECOVERMESSAGE:
11741 +               down_read(&ls->ls_rec_rsblist);
11742 +               rcom_process_message(ls, nodeid, rc);
11743 +               up_read(&ls->ls_rec_rsblist);
11744 +               break;
11745 +
11746 +       case GDLM_REMCMD_RECOVERREPLY:
11747 +               rcom_process_reply(ls, nodeid, rc);
11748 +               break;
11749 +
11750 +       default:
11751 +               GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
11752 +       }
11753 +}
11754 +
11755 diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
11756 --- linux-orig/cluster/dlm/reccomms.h   1970-01-01 07:30:00.000000000 +0730
11757 +++ linux-patched/cluster/dlm/reccomms.h        2004-06-25 18:31:07.000000000 +0800
11758 @@ -0,0 +1,37 @@
11759 +/******************************************************************************
11760 +*******************************************************************************
11761 +**
11762 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
11763 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
11764 +**  
11765 +**  This copyrighted material is made available to anyone wishing to use,
11766 +**  modify, copy, or redistribute it subject to the terms and conditions
11767 +**  of the GNU General Public License v.2.
11768 +**
11769 +*******************************************************************************
11770 +******************************************************************************/
11771 +
11772 +#ifndef __RECCOMMS_DOT_H__
11773 +#define __RECCOMMS_DOT_H__
11774 +
11775 +/* Bit flags */
11776 +
11777 +#define RESDIR_VALID            (1)
11778 +#define RESDIR_ALL_VALID        (2)
11779 +#define NODES_VALID             (4)
11780 +#define NODES_ALL_VALID         (8)
11781 +
11782 +#define RECCOMM_STATUS          (1)
11783 +#define RECCOMM_RECOVERNAMES    (2)
11784 +#define RECCOMM_GETMASTER       (3)
11785 +#define RECCOMM_BULKLOOKUP      (4)
11786 +#define RECCOMM_NEWLOCKS        (5)
11787 +#define RECCOMM_NEWLOCKIDS      (6)
11788 +#define RECCOMM_REMRESDATA      (7)
11789 +
11790 +int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc,
11791 +                     int need_reply);
11792 +void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header);
11793 +void rcom_log_clear(gd_ls_t *ls);
11794 +
11795 +#endif
11796 diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
11797 --- linux-orig/cluster/dlm/recover.c    1970-01-01 07:30:00.000000000 +0730
11798 +++ linux-patched/cluster/dlm/recover.c 2004-06-25 18:31:07.000000000 +0800
11799 @@ -0,0 +1,632 @@
11800 +/******************************************************************************
11801 +*******************************************************************************
11802 +**
11803 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
11804 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
11805 +**
11806 +**  This copyrighted material is made available to anyone wishing to use,
11807 +**  modify, copy, or redistribute it subject to the terms and conditions
11808 +**  of the GNU General Public License v.2.
11809 +**
11810 +*******************************************************************************
11811 +******************************************************************************/
11812 +
11813 +#include "dlm_internal.h"
11814 +#include "reccomms.h"
11815 +#include "dir.h"
11816 +#include "locking.h"
11817 +#include "rsb.h"
11818 +#include "lockspace.h"
11819 +#include "lkb.h"
11820 +#include "nodes.h"
11821 +#include "config.h"
11822 +#include "ast.h"
11823 +#include "memory.h"
11824 +
11825 +/*
11826 + * Called in recovery routines to check whether the recovery process has been
11827 + * interrupted/stopped by another transition.  An in-progress recovery will abort
11828 + * if the lockspace is "stopped" so that a new recovery process can start from
11829 + * the beginning when the lockspace is "started" again.
11830 + */
11831 +
11832 +int gdlm_recovery_stopped(gd_ls_t *ls)
11833 +{
11834 +       return test_bit(LSFL_LS_STOP, &ls->ls_flags);
11835 +}
11836 +
11837 +static void gdlm_wait_timer_fn(unsigned long data)
11838 +{
11839 +       gd_ls_t *ls = (gd_ls_t *) data;
11840 +
11841 +       wake_up(&ls->ls_wait_general);
11842 +}
11843 +
11844 +/*
11845 + * Wait until the given function returns non-zero or the lockspace is stopped
11846 + * (LS_STOP set due to failure of a node in ls_nodes).  When another function
11847 + * thinks it could have completed the waited-on task, it should wake up ls_wait_general
11848 + * to get an immediate response rather than waiting for the timer to detect the
11849 + * result.  A timer wakes us up periodically while waiting to see if we should
11850 + * abort due to a node failure.
11851 + */
11852 +
11853 +int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls))
11854 +{
11855 +       struct timer_list timer;
11856 +       int error = 0;
11857 +
11858 +       init_timer(&timer);
11859 +       timer.function = gdlm_wait_timer_fn;
11860 +       timer.data = (long) ls;
11861 +
11862 +       for (;;) {
11863 +               mod_timer(&timer, jiffies + (5 * HZ));
11864 +
11865 +               wchan_cond_sleep_intr(ls->ls_wait_general,
11866 +                                     !testfn(ls) &&
11867 +                                     !test_bit(LSFL_LS_STOP, &ls->ls_flags));
11868 +
11869 +               if (timer_pending(&timer))
11870 +                       del_timer(&timer);
11871 +
11872 +               if (testfn(ls))
11873 +                       break;
11874 +
11875 +               if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
11876 +                       error = -1;
11877 +                       break;
11878 +               }
11879 +       }
11880 +
11881 +       return error;
11882 +}
11883 +
11884 +int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status)
11885 +{
11886 +       gd_rcom_t rc_stack, *rc;
11887 +       gd_csb_t *csb;
11888 +       int status;
11889 +       int error = 0;
11890 +
11891 +       memset(&rc_stack, 0, sizeof(gd_rcom_t));
11892 +       rc = &rc_stack;
11893 +       rc->rc_datalen = 0;
11894 +
11895 +       list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11896 +               for (;;) {
11897 +                       error = gdlm_recovery_stopped(ls);
11898 +                       if (error)
11899 +                               goto out;
11900 +
11901 +                       error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
11902 +                                                 RECCOMM_STATUS, rc, 1);
11903 +                       if (error)
11904 +                               goto out;
11905 +
11906 +                       status = rc->rc_buf[0];
11907 +                       if (status & wait_status)
11908 +                               break;
11909 +                       else {
11910 +                               set_current_state(TASK_INTERRUPTIBLE);
11911 +                               schedule_timeout(HZ >> 1);
11912 +                       }
11913 +               }
11914 +       }
11915 +
11916 +      out:
11917 +       return error;
11918 +}
11919 +
11920 +int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status)
11921 +{
11922 +       gd_rcom_t rc_stack, *rc;
11923 +       uint32_t nodeid = ls->ls_low_nodeid;
11924 +       int status;
11925 +       int error = 0;
11926 +
11927 +       memset(&rc_stack, 0, sizeof(gd_rcom_t));
11928 +       rc = &rc_stack;
11929 +       rc->rc_datalen = 0;
11930 +
11931 +       for (;;) {
11932 +               error = gdlm_recovery_stopped(ls);
11933 +               if (error)
11934 +                       goto out;
11935 +
11936 +               error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
11937 +               if (error)
11938 +                       break;
11939 +
11940 +               status = rc->rc_buf[0];
11941 +               if (status & wait_status)
11942 +                       break;
11943 +               else {
11944 +                       set_current_state(TASK_INTERRUPTIBLE);
11945 +                       schedule_timeout(HZ >> 1);
11946 +               }
11947 +       }
11948 +
11949 +      out:
11950 +       return error;
11951 +}
11952 +
11953 +static int purge_queue(gd_ls_t *ls, struct list_head *queue)
11954 +{
11955 +       gd_lkb_t *lkb, *safe;
11956 +       gd_res_t *rsb;
11957 +       int count = 0;
11958 +
11959 +       list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
11960 +               if (!lkb->lkb_nodeid)
11961 +                       continue;
11962 +
11963 +               GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
11964 +
11965 +               if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
11966 +                       list_del(&lkb->lkb_statequeue);
11967 +
11968 +                       rsb = lkb->lkb_resource;
+                       /* check the saved status before zeroing it: the old
+                        * code tested lkb_status after clearing it, and
+                        * "&lkb->lkb_duetime" was always non-NULL, so the
+                        * deadlock queue entry was never removed */
+                       if (lkb->lkb_status == GDLM_LKSTS_CONVERT
+                           && lkb->lkb_duetime)
+                               remove_from_deadlockqueue(lkb);
+                       lkb->lkb_status = 0;
11974 +
11975 +                       release_lkb(ls, lkb);
11976 +                       release_rsb(rsb);
11977 +                       count++;
11978 +               }
11979 +       }
11980 +
11981 +       return count;
11982 +}
11983 +
11984 +/*
11985 + * Go through local restbl and for each rsb we're master of, clear out any
11986 + * lkb's held by departed nodes.
11987 + */
11988 +
11989 +int restbl_lkb_purge(gd_ls_t *ls)
11990 +{
11991 +       struct list_head *tmp2, *safe2;
11992 +       int count = 0;
11993 +       gd_res_t *rootrsb, *safe, *rsb;
11994 +
11995 +       log_all(ls, "purge locks of departed nodes");
11996 +
11997 +       list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
11998 +
11999 +               rootrsb->res_resdir_seq = 1;
12000 +
12001 +               if (rootrsb->res_nodeid)
12002 +                       continue;
12003 +
12004 +               hold_rsb(rootrsb);
12005 +               down_write(&rootrsb->res_lock);
12006 +
12007 +               /* This traverses the subreslist in reverse order so we purge
12008 +                * the children before their parents. */
12009 +
12010 +               for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12011 +                    tmp2 != &rootrsb->res_subreslist;
12012 +                    tmp2 = safe2, safe2 = safe2->prev) {
12013 +                       rsb = list_entry(tmp2, gd_res_t, res_subreslist);
12014 +
12015 +                       hold_rsb(rsb);
12016 +                       purge_queue(ls, &rsb->res_grantqueue);
12017 +                       purge_queue(ls, &rsb->res_convertqueue);
12018 +                       purge_queue(ls, &rsb->res_waitqueue);
12019 +                       release_rsb(rsb);
12020 +               }
12021 +               count += purge_queue(ls, &rootrsb->res_grantqueue);
12022 +               count += purge_queue(ls, &rootrsb->res_convertqueue);
12023 +               count += purge_queue(ls, &rootrsb->res_waitqueue);
12024 +
12025 +               up_write(&rootrsb->res_lock);
12026 +               release_rsb(rootrsb);
12027 +       }
12028 +
12029 +       log_all(ls, "purged %d locks", count);
12030 +
12031 +       return 0;
12032 +}
12033 +
12034 +/*
12035 + * Grant any locks that have become grantable after a purge
12036 + */
12037 +
12038 +int restbl_grant_after_purge(gd_ls_t *ls)
12039 +{
12040 +       gd_res_t *root, *rsb, *safe;
12041 +       int error = 0;
12042 +
12043 +       down_write(&ls->ls_gap_rsblist);
12044 +
12045 +       list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12046 +               /* only the rsb master grants locks */
12047 +               if (root->res_nodeid)
12048 +                       continue;
12049 +
12050 +               if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12051 +                       log_debug(ls, "restbl_grant_after_purge aborted");
12052 +                       error = -EINTR;
12053 +                       up_write(&ls->ls_gap_rsblist);
12054 +                       goto out;
12055 +               }
12056 +
12057 +               down_write(&root->res_lock);
12058 +               grant_pending_locks(root);
12059 +               up_write(&root->res_lock);
12060 +
12061 +               list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12062 +                       down_write(&rsb->res_lock);
12063 +                       grant_pending_locks(rsb);
12064 +                       up_write(&rsb->res_lock);
12065 +               }
12066 +       }
12067 +       up_write(&ls->ls_gap_rsblist);
12068 +       wake_astd();
12069 + out:
12070 +       return error;
12071 +}
12072 +
12073 +/*
12074 + * Set the lock master for all LKBs in a lock queue
12075 + */
12076 +
12077 +static void set_lock_master(struct list_head *queue, int nodeid)
12078 +{
12079 +       gd_lkb_t *lkb;
12080 +
12081 +       list_for_each_entry(lkb, queue, lkb_statequeue) {
12082 +               /* Don't muck around with pre-existing sublocks */
12083 +               if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12084 +                       lkb->lkb_nodeid = nodeid;
12085 +       }
12086 +}
12087 +
12088 +static void set_master_lkbs(gd_res_t *rsb)
12089 +{
12090 +       set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12091 +       set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12092 +       set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12093 +}
12094 +
12095 +/*
12096 + * This rsb struct is now the master, so it is responsible for keeping the
12097 + * latest lvb.  Check whether any current lkb's have an up-to-date copy of the
12098 + * lvb to be used as the rsb copy.  An equivalent step occurs as new lkb's
12099 + * arrive for this rsb in deserialise_lkb.
12100 + */
12101 +
12102 +static void set_rsb_lvb(gd_res_t *rsb)
12103 +{
12104 +       gd_lkb_t *lkb;
12105 +
12106 +       list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12107 +
12108 +               if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12109 +                   (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12110 +                   (lkb->lkb_grmode > DLM_LOCK_NL))
12111 +               {
12112 +                       if (!rsb->res_lvbptr)
12113 +                               rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12114 +
12115 +                       memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12116 +                       return;
12117 +               }
12118 +       }
12119 +
12120 +       list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12121 +
12122 +               if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12123 +                   (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12124 +                   (lkb->lkb_grmode > DLM_LOCK_NL))
12125 +               {
12126 +                       if (!rsb->res_lvbptr)
12127 +                               rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12128 +
12129 +                       memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12130 +                       return;
12131 +               }
12132 +       }
12133 +}
12134 +
12135 +/*
12136 + * Propagate the new master nodeid to locks, subrsbs, sublocks.
12137 + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12138 + */
12139 +
12140 +static void set_new_master(gd_res_t *rsb)
12141 +{
12142 +       gd_res_t *subrsb;
12143 +
12144 +       down_write(&rsb->res_lock);
12145 +
12146 +       if (rsb->res_nodeid == our_nodeid()) {
12147 +               rsb->res_nodeid = 0;
12148 +               set_rsb_lvb(rsb);
12149 +       }
12150 +
12151 +       set_master_lkbs(rsb);
12152 +
12153 +       list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
12154 +               subrsb->res_nodeid = rsb->res_nodeid;
12155 +               set_master_lkbs(subrsb);
12156 +       }
12157 +
12158 +       up_write(&rsb->res_lock);
12159 +
12160 +       set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
12161 +}
12162 +
12163 +/*
12164 + * The recover_list contains all the rsb's for which we've requested the new
12165 + * master nodeid.  As replies are returned from the resource directories the
12166 + * rsb's are removed from the list.  When the list is empty we're done.
12167 + *
12168 + * The recover_list is later similarly used for all rsb's for which we've sent
12169 + * new lkb's and need to receive new corresponding lkid's.
12170 + */
12171 +
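+/*
+ * Typical request/reply pattern (an illustrative sketch only, mirroring
+ * rsb_master_lookup() and restbl_rsb_update_recv() below):
+ *
+ *     recover_list_add(rsb);                   before sending the request
+ *     rcom_send_message(ls, nodeid, ...);      asynchronous request
+ *         ...
+ *     rsb = recover_list_find(ls, msgid);      when the reply arrives
+ *     recover_list_del(rsb);
+ *     if (recover_list_empty(ls))
+ *             wake_up(&ls->ls_wait_general);   releases gdlm_wait_function()
+ */
+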
12172 +int recover_list_empty(gd_ls_t *ls)
12173 +{
12174 +       int empty;
12175 +
12176 +       spin_lock(&ls->ls_recover_list_lock);
12177 +       empty = list_empty(&ls->ls_recover_list);
12178 +       spin_unlock(&ls->ls_recover_list_lock);
12179 +
12180 +       return empty;
12181 +}
12182 +
12183 +int recover_list_count(gd_ls_t *ls)
12184 +{
12185 +       int count;
12186 +
12187 +       spin_lock(&ls->ls_recover_list_lock);
12188 +       count = ls->ls_recover_list_count;
12189 +       spin_unlock(&ls->ls_recover_list_lock);
12190 +
12191 +       return count;
12192 +}
12193 +
12194 +void recover_list_add(gd_res_t *rsb)
12195 +{
12196 +       gd_ls_t *ls = rsb->res_ls;
12197 +
12198 +       spin_lock(&ls->ls_recover_list_lock);
12199 +       if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
12200 +               list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
12201 +               ls->ls_recover_list_count++;
12202 +               hold_rsb(rsb);
12203 +       }
12204 +       spin_unlock(&ls->ls_recover_list_lock);
12205 +}
12206 +
12207 +void recover_list_del(gd_res_t *rsb)
12208 +{
12209 +       gd_ls_t *ls = rsb->res_ls;
12210 +
12211 +       spin_lock(&ls->ls_recover_list_lock);
12212 +       clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
12213 +       list_del(&rsb->res_recover_list);
12214 +       ls->ls_recover_list_count--;
12215 +       spin_unlock(&ls->ls_recover_list_lock);
12216 +
12217 +       release_rsb(rsb);
12218 +}
12219 +
12220 +static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid)
12221 +{
12222 +       gd_res_t *rsb = NULL;
12223 +
12224 +       spin_lock(&ls->ls_recover_list_lock);
12225 +
12226 +       list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
12227 +               if (rsb->res_recover_msgid == msgid)
12228 +                       goto rec_found;
12229 +       }
12230 +       rsb = NULL;
12231 +
12232 + rec_found:
12233 +       spin_unlock(&ls->ls_recover_list_lock);
12234 +       return rsb;
12235 +}
12236 +
12237 +#if 0
12238 +static void recover_list_clear(gd_ls_t *ls)
12239 +{
12240 +       gd_res_t *rsb;
12241 +
12242 +
12243 +       spin_lock(&ls->ls_recover_list_lock);
12244 +
12245 +       while (!list_empty(&ls->ls_recover_list)) {
12246 +               rsb = list_entry(ls->ls_recover_list.next, gd_res_t,
12247 +                                res_recover_list);
12248 +               list_del(&rsb->res_recover_list);
12249 +               ls->ls_recover_list_count--;
12250 +       }
12251 +       spin_unlock(&ls->ls_recover_list_lock);
12252 +
12253 +}
12254 +#endif
12255 +
12256 +#if 0
12257 +void recover_list_dump(gd_ls_t *ls)
12258 +{
12259 +       struct list_head *tmp;
12260 +       gd_res_t *rsb;
12261 +
12262 +       spin_lock(&ls->ls_recover_list_lock);
12263 +
12264 +       printk("recover_list_count=%d\n", ls->ls_recover_list_count);
12265 +
12266 +       list_for_each(tmp, &ls->ls_recover_list) {
12267 +               rsb = list_entry(tmp, gd_res_t, res_recover_list);
12268 +               gdlm_res_dbprint(rsb);
12269 +       }
12270 +       spin_unlock(&ls->ls_recover_list_lock);
12271 +}
12272 +#endif
12273 +
12274 +static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc)
12275 +{
12276 +       gd_ls_t *ls = rsb->res_ls;
12277 +       gd_resdata_t *rd;
12278 +       uint32_t dir_nodeid;
12279 +       int error;
12280 +
12281 +       dir_nodeid = get_directory_nodeid(rsb);
12282 +
12283 +       if (dir_nodeid == our_nodeid()) {
12284 +               error = get_resdata(ls, dir_nodeid, rsb->res_name,
12285 +                                   rsb->res_length, &rd, 1);
12286 +               if (error)
12287 +                       goto fail;
12288 +
12289 +               rsb->res_nodeid = rd->rd_master_nodeid;
12290 +               set_new_master(rsb);
12291 +       } else {
12292 +               /* As we are the only thread doing recovery, this
12293 +                  should be safe.  If not, then we need to use a
12294 +                  different ID somehow.  We must set it in the RSB
12295 +                  before rcom_send_message() completes because we may
12296 +                  get a reply quite quickly. */
12297 +               rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
12298 +
12299 +               recover_list_add(rsb);
12300 +
12301 +               memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
12302 +               rc->rc_datalen = rsb->res_length;
12303 +
12304 +               error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
12305 +                                         rc, 0);
12306 +               if (error)
12307 +                       goto fail;
12308 +       }
12309 +
12310 +      fail:
12311 +       return error;
12312 +}
12313 +
12314 +/*
12315 + * Go through local root resources and for each rsb which has a master which
12316 + * has departed, get the new master nodeid from the resdir.  The resdir will
12317 + * assign mastery to the first node to look up the new master.  That means
12318 + * we'll discover in this lookup if we're the new master of any rsb's.
12319 + *
12320 + * We fire off all the resdir requests individually and asynchronously to the
12321 + * correct resdir node.  The replies are processed in restbl_rsb_update_recv().
12322 + */
12323 +
12324 +int restbl_rsb_update(gd_ls_t *ls)
12325 +{
12326 +       gd_res_t *rsb, *safe;
12327 +       gd_rcom_t *rc;
12328 +       int error = -ENOMEM;
12329 +       int count = 0;
12330 +
12331 +       log_all(ls, "update remastered resources");
12332 +
12333 +       rc = allocate_rcom_buffer(ls);
12334 +       if (!rc)
12335 +               goto out;
12336 +
12337 +       list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
12338 +               if (!rsb->res_nodeid)
12339 +                       continue;
12340 +
12341 +               error = gdlm_recovery_stopped(ls);
12342 +               if (error)
12343 +                       goto out_free;
12344 +
12345 +               if (in_nodes_gone(ls, rsb->res_nodeid)) {
12346 +                       error = rsb_master_lookup(rsb, rc);
12347 +                       if (error)
12348 +                               goto out_free;
12349 +                       count++;
12350 +               }
12351 +       }
12352 +
12353 +       error = gdlm_wait_function(ls, &recover_list_empty);
12354 +
12355 +       log_all(ls, "updated %d resources", count);
12356 +
12357 +      out_free:
12358 +       free_rcom_buffer(rc);
12359 +
12360 +      out:
12361 +       return error;
12362 +}
12363 +
12364 +int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length,
12365 +                          int msgid)
12366 +{
12367 +       gd_res_t *rsb;
12368 +       uint32_t be_nodeid;
12369 +
12370 +       rsb = recover_list_find(ls, msgid);
12371 +       if (!rsb) {
12372 +               log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
12373 +               goto out;
12374 +       }
12375 +
12376 +       memcpy(&be_nodeid, buf, sizeof(uint32_t));
12377 +       rsb->res_nodeid = be32_to_cpu(be_nodeid);
12378 +       set_new_master(rsb);
12379 +       recover_list_del(rsb);
12380 +
12381 +       if (recover_list_empty(ls))
12382 +               wake_up(&ls->ls_wait_general);
12383 +
12384 +      out:
12385 +       return 0;
12386 +}
12387 +
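+/*
+ * The GETMASTER reply payload handled above is simply the new master's
+ * nodeid as a big-endian uint32_t; e.g. a new master nodeid of 3 arrives
+ * as the four bytes 00 00 00 03 and is unpacked with be32_to_cpu().
+ */
+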
12388 +/*
12389 + * This function is no longer used.
12390 + */
12391 +
12392 +int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen,
12393 +                      char *outbuf)
12394 +{
12395 +       char *inbufptr, *outbufptr;
12396 +
12397 +       /*
12398 +        * The other node wants nodeids matching the resource names in inbuf.
12399 +        * The resource names are packed into inbuf as
12400 +        * [len1][name1][len2][name2]...  where lenX is 1 byte and nameX is
12401 +        * lenX bytes.  Matching nodeids are packed into outbuf in order
12402 +        * [nodeid1][nodeid2]...
12403 +        */
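+       /*
+        * Worked example (illustrative): for the two names "ab" and "xyz",
+        * inbuf holds the 7 bytes 02 'a' 'b' 03 'x' 'y' 'z', and outbuf is
+        * filled with 8 bytes: two big-endian uint32_t master nodeids.
+        */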
12404 +
12405 +       inbufptr = inbuf;
12406 +       outbufptr = outbuf;
12407 +
12408 +       while (inbufptr < inbuf + inlen) {
12409 +               gd_resdata_t *rd;
12410 +               uint32_t be_nodeid;
12411 +               int status;
12412 +
12413 +               status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr,
12414 +                                    &rd, 1);
12415 +               if (status != 0)
12416 +                       goto fail;
12417 +
12418 +               inbufptr += *inbufptr + 1;
12419 +
12420 +               be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
12421 +               memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
12422 +               outbufptr += sizeof(uint32_t);
12423 +
12424 +               /* add an assertion that outbufptr - outbuf is not greater than ... */
12425 +       }
12426 +
12427 +       return (outbufptr - outbuf);
12428 +
12429 +      fail:
12430 +       return -1;
12431 +}
12432 diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
12433 --- linux-orig/cluster/dlm/recover.h    1970-01-01 07:30:00.000000000 +0730
12434 +++ linux-patched/cluster/dlm/recover.h 2004-06-25 18:31:07.000000000 +0800
12435 @@ -0,0 +1,34 @@
12436 +/******************************************************************************
12437 +*******************************************************************************
12438 +**
12439 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
12440 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
12441 +**  
12442 +**  This copyrighted material is made available to anyone wishing to use,
12443 +**  modify, copy, or redistribute it subject to the terms and conditions
12444 +**  of the GNU General Public License v.2.
12445 +**
12446 +*******************************************************************************
12447 +******************************************************************************/
12448 +
12449 +#ifndef __RECOVER_DOT_H__
12450 +#define __RECOVER_DOT_H__
12451 +
12452 +int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls));
12453 +int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status);
12454 +int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status);
12455 +int gdlm_recovery_stopped(gd_ls_t * ls);
12456 +int recover_list_empty(gd_ls_t * ls);
12457 +int recover_list_count(gd_ls_t * ls);
12458 +void recover_list_add(gd_res_t * rsb);
12459 +void recover_list_del(gd_res_t * rsb);
12460 +void recover_list_dump(gd_ls_t * ls);
12461 +int restbl_lkb_purge(gd_ls_t * ls);
12462 +int restbl_grant_after_purge(gd_ls_t * ls);
12463 +int restbl_rsb_update(gd_ls_t * ls);
12464 +int restbl_rsb_update_recv(gd_ls_t * ls, uint32_t nodeid, char *buf, int len,
12465 +                          int msgid);
12466 +int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen,
12467 +                      char *outbuf);
12468 +
12469 +#endif                         /* __RECOVER_DOT_H__ */
12470 diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
12471 --- linux-orig/cluster/dlm/recoverd.c   1970-01-01 07:30:00.000000000 +0730
12472 +++ linux-patched/cluster/dlm/recoverd.c        2004-06-25 18:31:07.000000000 +0800
12473 @@ -0,0 +1,692 @@
12474 +/******************************************************************************
12475 +*******************************************************************************
12476 +**
12477 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
12478 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
12479 +**  
12480 +**  This copyrighted material is made available to anyone wishing to use,
12481 +**  modify, copy, or redistribute it subject to the terms and conditions
12482 +**  of the GNU General Public License v.2.
12483 +**
12484 +*******************************************************************************
12485 +******************************************************************************/
12486 +
12487 +#include "dlm_internal.h"
12488 +#include "nodes.h"
12489 +#include "dir.h"
12490 +#include "ast.h"
12491 +#include "recover.h"
12492 +#include "lockspace.h"
12493 +#include "lowcomms.h"
12494 +#include "lockqueue.h"
12495 +#include "lkb.h"
12496 +#include "rebuild.h"
12497 +
12498 +/* 
12499 + * next_move actions
12500 + */
12501 +
12502 +#define DO_STOP             (1)
12503 +#define DO_START            (2)
12504 +#define DO_FINISH           (3)
12505 +#define DO_FINISH_STOP      (4)
12506 +#define DO_FINISH_START     (5)
12507 +
12508 +/* 
12509 + * recoverd_flags for thread
12510 + */
12511 +
12512 +#define THREAD_STOP         (0)
12513 +
12514 +/* 
12515 + * local thread variables
12516 + */
12517 +
12518 +static unsigned long recoverd_flags;
12519 +static struct completion recoverd_run;
12520 +static wait_queue_head_t recoverd_wait;
12521 +static struct task_struct *recoverd_task;
12522 +
12523 +/* 
12524 + * Queue of lockspaces (gd_recover_t structs) which need to be
12525 + * started/recovered
12526 + */
12527 +
12528 +static struct list_head recoverd_start_queue;
12529 +static atomic_t recoverd_start_count;
12530 +
12531 +extern struct list_head lslist;
12532 +extern spinlock_t lslist_lock;
12533 +
12534 +void dlm_recoverd_init(void)
12535 +{
12536 +       INIT_LIST_HEAD(&recoverd_start_queue);
12537 +       atomic_set(&recoverd_start_count, 0);
12538 +
12539 +       init_completion(&recoverd_run);
12540 +       init_waitqueue_head(&recoverd_wait);
12541 +       memset(&recoverd_flags, 0, sizeof(unsigned long));
12542 +}
12543 +
12544 +static int enable_locking(gd_ls_t *ls, int event_id)
12545 +{
12546 +       int error = 0;
12547 +
12548 +       spin_lock(&ls->ls_recover_lock);
12549 +       if (ls->ls_last_stop < event_id) {
12550 +               set_bit(LSFL_LS_RUN, &ls->ls_flags);
12551 +               up_write(&ls->ls_in_recovery);
12552 +       } else {
12553 +               error = -EINTR;
12554 +               log_debug(ls, "enable_locking: abort %d", event_id);
12555 +       }
12556 +       spin_unlock(&ls->ls_recover_lock);
12557 +       return error;
12558 +}
12559 +
12560 +static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr)
12561 +{
12562 +       int error;
12563 +
12564 +       log_all(ls, "recover event %u (first)", gr->gr_event_id);
12565 +
12566 +       kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
12567 +
12568 +       error = ls_nodes_init(ls, gr);
12569 +       if (error) {
12570 +               log_error(ls, "nodes_init failed %d", error);
12571 +               goto out;
12572 +       }
12573 +
12574 +       error = resdir_rebuild_local(ls);
12575 +       if (error) {
12576 +               log_error(ls, "resdir_rebuild_local failed %d", error);
12577 +               goto out;
12578 +       }
12579 +
12580 +       error = resdir_rebuild_wait(ls);
12581 +       if (error) {
12582 +               log_error(ls, "resdir_rebuild_wait failed %d", error);
12583 +               goto out;
12584 +       }
12585 +
12586 +       log_all(ls, "recover event %u done", gr->gr_event_id);
12587 +       kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12588 +
12589 +      out:
12590 +       return error;
12591 +}
12592 +
12593 +/* 
12594 + * We are given here a new group of nodes which are in the lockspace.  We first
12595 + * figure out the differences in ls membership from when we were last running.
12596 + * If nodes from before are gone, then there will be some lock recovery to do.
12597 + * If there are only nodes which have joined, then there's no lock recovery.
12598 + *
12599 + * note: cman requires an rc to finish starting on an revent (where nodes die)
12600 + * before it allows an sevent (where nodes join) to be processed.  This means
12601 + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
12602 + * joined.
12603 + */
12604 +
12605 +static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr)
12606 +{
12607 +       int error, neg = 0;
12608 +
12609 +       log_all(ls, "recover event %u", gr->gr_event_id);
12610 +
12611 +       /* 
12612 +        * Add or remove nodes from the lockspace's ls_nodes list.
12613 +        */
12614 +
12615 +       error = ls_nodes_reconfig(ls, gr, &neg);
12616 +       if (error) {
12617 +               log_error(ls, "nodes_reconfig failed %d", error);
12618 +               goto fail;
12619 +       }
12620 +
12621 +       /* 
12622 +        * Rebuild our own share of the resdir by collecting from all other
12623 +        * nodes rsb name/master pairs for which the name hashes to us.
12624 +        */
12625 +
12626 +       error = resdir_rebuild_local(ls);
12627 +       if (error) {
12628 +               log_error(ls, "resdir_rebuild_local failed %d", error);
12629 +               goto fail;
12630 +       }
12631 +
12632 +       /* 
12633 +        * Purge resdir-related requests that are being held in requestqueue.
12634 +        * All resdir requests from before recovery started are invalid now due
12635 +        * to the resdir rebuild and will be resent by the requesting nodes.
12636 +        */
12637 +
12638 +       purge_requestqueue(ls);
12639 +       set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12640 +
12641 +       /* 
12642 +        * Wait for all nodes to complete resdir rebuild.
12643 +        */
12644 +
12645 +       error = resdir_rebuild_wait(ls);
12646 +       if (error) {
12647 +               log_error(ls, "resdir_rebuild_wait failed %d", error);
12648 +               goto fail;
12649 +       }
12650 +
12651 +       /* 
12652 +        * Mark our own lkb's waiting in the lockqueue for remote replies from
12653 +        * nodes that are now departed.  These will be resent to the new
12654 +        * masters in resend_cluster_requests.  Also mark resdir lookup
12655 +        * requests for resending.
12656 +        */
12657 +
12658 +       lockqueue_lkb_mark(ls);
12659 +
12660 +       error = gdlm_recovery_stopped(ls);
12661 +       if (error)
12662 +               goto fail;
12663 +
12664 +       if (neg) {
12665 +               /* 
12666 +                * Clear lkb's for departed nodes.  This can't fail since it
12667 +                * doesn't involve communicating with other nodes.
12668 +                */
12669 +
12670 +               down_write(&ls->ls_rec_rsblist);
12671 +               restbl_lkb_purge(ls);
12672 +               up_write(&ls->ls_rec_rsblist);
12673 +
12674 +               down_read(&ls->ls_rec_rsblist);
12675 +
12676 +               /* 
12677 +                * Get new master id's for rsb's of departed nodes.  This fails
12678 +                * if we can't communicate with other nodes.
12679 +                */
12680 +
12681 +               error = restbl_rsb_update(ls);
12682 +               if (error) {
12683 +                       log_error(ls, "restbl_rsb_update failed %d", error);
12684 +                       goto fail_up;
12685 +               }
12686 +
12687 +               /* 
12688 +                * Send our lkb info to new masters.  This fails if we can't
12689 +                * communicate with a node.
12690 +                */
12691 +
12692 +               error = rebuild_rsbs_send(ls);
12693 +               if (error) {
12694 +                       log_error(ls, "rebuild_rsbs_send failed %d", error);
12695 +                       goto fail_up;
12696 +               }
12697 +               up_read(&ls->ls_rec_rsblist);
12698 +       }
12699 +
12700 +       clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12701 +
12702 +       log_all(ls, "recover event %u done", gr->gr_event_id);
12703 +       kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12704 +       return 0;
12705 +
12706 + fail_up:
12707 +       up_read(&ls->ls_rec_rsblist);
12708 + fail:
12709 +       log_all(ls, "recover event %d error %d", gr->gr_event_id, error);
12710 +       return error;
12711 +}
12712 +
12713 +static void clear_finished_nodes(gd_ls_t *ls, int finish_event)
12714 +{
12715 +       gd_csb_t *csb, *safe;
12716 +
12717 +       list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) {
12718 +               if (csb->csb_gone_event <= finish_event) {
12719 +                       list_del(&csb->csb_list);
12720 +                       release_csb(csb);
12721 +               }
12722 +       }
12723 +}
12724 +
12725 +/* 
12726 + * Between calls to this routine for a ls, there can be multiple stop/start
12727 + * events from cman where every start but the latest is cancelled by stops.
12728 + * There can only be a single finish from cman because every finish requires us
12729 + * to call start_done.  A single finish event could be followed by multiple
12730 + * stop/start events.  This routine takes any combination of events from cman
12731 + * and boils them down to one course of action.
12732 + */
12733 +
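+/*
+ * Summary of the eight stop/start/finish combinations handled below
+ * ("start kept" means an uncancelled start event survives the toss):
+ *
+ *   stop start finish   action
+ *    0    0     0       nothing
+ *    0    0     1       DO_FINISH
+ *    0    1     0       DO_START
+ *    0    1     1       invalid (finish and start with no stop)
+ *    1    0     0       DO_STOP
+ *    1    0     1       DO_FINISH_STOP
+ *    1    1     0       DO_START if start kept, else DO_STOP
+ *    1    1     1       DO_FINISH_START if start kept, else DO_FINISH_STOP
+ */
+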
12734 +int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out)
12735 +{
12736 +       LIST_HEAD(events);
12737 +       unsigned int cmd = 0, stop, start, finish;
12738 +       unsigned int last_stop, last_start, last_finish;
12739 +       gd_recover_t *gr = NULL, *start_gr = NULL;
12740 +
12741 +       /* 
12742 +        * Grab the current state of cman/sm events.
12743 +        */
12744 +
12745 +       spin_lock(&ls->ls_recover_lock);
12746 +
12747 +       stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
12748 +       start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
12749 +       finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
12750 +
12751 +       last_stop = ls->ls_last_stop;
12752 +       last_start = ls->ls_last_start;
12753 +       last_finish = ls->ls_last_finish;
12754 +
12755 +       while (!list_empty(&ls->ls_recover)) {
12756 +               gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list);
12757 +               list_del(&gr->gr_list);
12758 +               list_add_tail(&gr->gr_list, &events);
12759 +       }
12760 +       spin_unlock(&ls->ls_recover_lock);
12761 +
12762 +       log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
12763 +                 last_stop, last_start, last_finish);
12764 +
12765 +       /* 
12766 +        * Toss start events which have since been cancelled.
12767 +        */
12768 +
12769 +       while (!list_empty(&events)) {
12770 +               GDLM_ASSERT(start,);
12771 +               gr = list_entry(events.next, gd_recover_t, gr_list);
12772 +               list_del(&gr->gr_list);
12773 +
12774 +               if (gr->gr_event_id <= last_stop) {
12775 +                       log_debug(ls, "move skip event %u", gr->gr_event_id);
12776 +                       kfree(gr->gr_nodeids);
12777 +                       free_dlm_recover(gr);
12778 +                       gr = NULL;
12779 +               } else {
12780 +                       log_debug(ls, "move use event %u", gr->gr_event_id);
12781 +                       GDLM_ASSERT(!start_gr,);
12782 +                       start_gr = gr;
12783 +               }
12784 +       }
12785 +
12786 +       /* 
12787 +        * Eight possible combinations of events.
12788 +        */
12789 +
12790 +       /* 0 */
12791 +       if (!stop && !start && !finish) {
12792 +               GDLM_ASSERT(!start_gr,);
12793 +               cmd = 0;
12794 +               goto out;
12795 +       }
12796 +
12797 +       /* 1 */
12798 +       if (!stop && !start && finish) {
12799 +               GDLM_ASSERT(!start_gr,);
12800 +               GDLM_ASSERT(last_start > last_stop,);
12801 +               GDLM_ASSERT(last_finish == last_start,);
12802 +               cmd = DO_FINISH;
12803 +               *finish_out = last_finish;
12804 +               goto out;
12805 +       }
12806 +
12807 +       /* 2 */
12808 +       if (!stop && start && !finish) {
12809 +               GDLM_ASSERT(start_gr,);
12810 +               GDLM_ASSERT(last_start > last_stop,);
12811 +               cmd = DO_START;
12812 +               *gr_out = start_gr;
12813 +               goto out;
12814 +       }
12815 +
12816 +       /* 3 */
12817 +       if (!stop && start && finish) {
12818 +               GDLM_ASSERT(0, printk("finish and start with no stop\n"););
12819 +       }
12820 +
12821 +       /* 4 */
12822 +       if (stop && !start && !finish) {
12823 +               GDLM_ASSERT(!start_gr,);
12824 +               GDLM_ASSERT(last_start == last_stop,);
12825 +               cmd = DO_STOP;
12826 +               goto out;
12827 +       }
12828 +
12829 +       /* 5 */
12830 +       if (stop && !start && finish) {
12831 +               GDLM_ASSERT(!start_gr,);
12832 +               GDLM_ASSERT(last_finish == last_start,);
12833 +               GDLM_ASSERT(last_stop == last_start,);
12834 +               cmd = DO_FINISH_STOP;
12835 +               *finish_out = last_finish;
12836 +               goto out;
12837 +       }
12838 +
12839 +       /* 6 */
12840 +       if (stop && start && !finish) {
12841 +               if (start_gr) {
12842 +                       GDLM_ASSERT(last_start > last_stop,);
12843 +                       cmd = DO_START;
12844 +                       *gr_out = start_gr;
12845 +               } else {
12846 +                       GDLM_ASSERT(last_stop == last_start,);
12847 +                       cmd = DO_STOP;
12848 +               }
12849 +               goto out;
12850 +       }
12851 +
12852 +       /* 7 */
12853 +       if (stop && start && finish) {
12854 +               if (start_gr) {
12855 +                       GDLM_ASSERT(last_start > last_stop,);
12856 +                       GDLM_ASSERT(last_start > last_finish,);
12857 +                       cmd = DO_FINISH_START;
12858 +                       *finish_out = last_finish;
12859 +                       *gr_out = start_gr;
12860 +               } else {
12861 +                       GDLM_ASSERT(last_start == last_stop,);
12862 +                       GDLM_ASSERT(last_start > last_finish,);
12863 +                       cmd = DO_FINISH_STOP;
12864 +                       *finish_out = last_finish;
12865 +               }
12866 +               goto out;
12867 +       }
12868 +
12869 +      out:
12870 +       return cmd;
12871 +}
12872 +
12873 +/* 
12874 + * This function decides what to do given every combination of current
12875 + * lockspace state and next lockspace state.
12876 + */
12877 +
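+/*
+ * Transition summary, derived from the cases below (each START outcome is
+ * RECONFIG_DONE on success, WAIT_START on error, except from INIT):
+ *
+ *   INIT:          START ok -> INIT_DONE, START fail -> INIT; STOP -> INIT
+ *   INIT_DONE:     FINISH -> CLEAR; STOP/FINISH_STOP -> WAIT_START;
+ *                  START/FINISH_START -> RECONFIG_DONE or WAIT_START
+ *   CLEAR:         STOP -> WAIT_START; START -> RECONFIG_DONE or WAIT_START
+ *   WAIT_START:    STOP -> WAIT_START; START -> RECONFIG_DONE or WAIT_START
+ *   RECONFIG_DONE: FINISH -> CLEAR; STOP/FINISH_STOP -> WAIT_START;
+ *                  START/FINISH_START -> RECONFIG_DONE or WAIT_START
+ */
+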
12878 +static void do_ls_recovery(gd_ls_t *ls)
12879 +{
12880 +       gd_recover_t *gr = NULL;
12881 +       int error, cur_state, next_state = 0, do_now, finish_event = 0;
12882 +
12883 +       do_now = next_move(ls, &gr, &finish_event);
12884 +       if (!do_now)
12885 +               goto out;
12886 +
12887 +       cur_state = ls->ls_state;
12888 +       next_state = 0;
12889 +
12890 +       GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
12891 +                   log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
12892 +
12893 +       /* 
12894 +        * LSST_CLEAR - we're not in any recovery state.  We can get a stop or
12895 +        * a stop and start which equates with a START.
12896 +        */
12897 +
12898 +       if (cur_state == LSST_CLEAR) {
12899 +               switch (do_now) {
12900 +               case DO_STOP:
12901 +                       next_state = LSST_WAIT_START;
12902 +                       break;
12903 +
12904 +               case DO_START:
12905 +                       error = ls_reconfig(ls, gr);
12906 +                       if (error)
12907 +                               next_state = LSST_WAIT_START;
12908 +                       else
12909 +                               next_state = LSST_RECONFIG_DONE;
12910 +                       break;
12911 +
12912 +               case DO_FINISH: /* invalid */
12913 +               case DO_FINISH_STOP:    /* invalid */
12914 +               case DO_FINISH_START:   /* invalid */
12915 +               default:
12916 +                       GDLM_ASSERT(0,);
12917 +               }
12918 +               goto out;
12919 +       }
12920 +
12921 +       /* 
12922 +        * LSST_WAIT_START - we're not running because of getting a stop or
12923 +        * failing a start.  We wait in this state for another stop/start or
12924 +        * just the next start to begin another reconfig attempt.
12925 +        */
12926 +
12927 +       if (cur_state == LSST_WAIT_START) {
12928 +               switch (do_now) {
12929 +               case DO_STOP:
12930 +                       break;
12931 +
12932 +               case DO_START:
12933 +                       error = ls_reconfig(ls, gr);
12934 +                       if (error)
12935 +                               next_state = LSST_WAIT_START;
12936 +                       else
12937 +                               next_state = LSST_RECONFIG_DONE;
12938 +                       break;
12939 +
12940 +               case DO_FINISH: /* invalid */
12941 +               case DO_FINISH_STOP:    /* invalid */
12942 +               case DO_FINISH_START:   /* invalid */
12943 +               default:
12944 +                       GDLM_ASSERT(0,);
12945 +               }
12946 +               goto out;
12947 +       }
12948 +
12949 +       /* 
12950 +        * LSST_RECONFIG_DONE - we entered this state after successfully
12951 +        * completing ls_reconfig and calling kcl_start_done.  We expect to get
12952 +        * a finish if everything goes ok.  A finish could be followed by stop
12953 +        * or stop/start before we get here to check it.  Or a finish may never
12954 +        * happen, only stop or stop/start.
12955 +        */
12956 +
12957 +       if (cur_state == LSST_RECONFIG_DONE) {
12958 +               switch (do_now) {
12959 +               case DO_FINISH:
12960 +                       clear_finished_nodes(ls, finish_event);
12961 +                       next_state = LSST_CLEAR;
12962 +
12963 +                       error = enable_locking(ls, finish_event);
12964 +                       if (error)
12965 +                               break;
12966 +
12967 +                       error = process_requestqueue(ls);
12968 +                       if (error)
12969 +                               break;
12970 +
12971 +                       error = resend_cluster_requests(ls);
12972 +                       if (error)
12973 +                               break;
12974 +
12975 +                       restbl_grant_after_purge(ls);
12976 +
12977 +                       log_all(ls, "recover event %u finished", finish_event);
12978 +                       break;
12979 +
12980 +               case DO_STOP:
12981 +                       next_state = LSST_WAIT_START;
12982 +                       break;
12983 +
12984 +               case DO_FINISH_STOP:
12985 +                       clear_finished_nodes(ls, finish_event);
12986 +                       next_state = LSST_WAIT_START;
12987 +                       break;
12988 +
12989 +               case DO_FINISH_START:
12990 +                       clear_finished_nodes(ls, finish_event);
12991 +                       /* fall into DO_START */
12992 +
12993 +               case DO_START:
12994 +                       error = ls_reconfig(ls, gr);
12995 +                       if (error)
12996 +                               next_state = LSST_WAIT_START;
12997 +                       else
12998 +                               next_state = LSST_RECONFIG_DONE;
12999 +                       break;
13000 +
13001 +               default:
13002 +                       GDLM_ASSERT(0,);
13003 +               }
13004 +               goto out;
13005 +       }
13006 +
13007 +       /* 
13008 +        * LSST_INIT - state after ls is created and before it has been
13009 +        * started.  A start operation will cause the ls to be started for the
13010 + * first time.  A failed start will cause it to just wait in INIT for
13011 +        * another stop/start.
13012 +        */
13013 +
13014 +       if (cur_state == LSST_INIT) {
13015 +               switch (do_now) {
13016 +               case DO_START:
13017 +                       error = ls_first_start(ls, gr);
13018 +                       if (!error)
13019 +                               next_state = LSST_INIT_DONE;
13020 +                       break;
13021 +
13022 +               case DO_STOP:
13023 +                       break;
13024 +
13025 +               case DO_FINISH: /* invalid */
13026 +               case DO_FINISH_STOP:    /* invalid */
13027 +               case DO_FINISH_START:   /* invalid */
13028 +               default:
13029 +                       GDLM_ASSERT(0,);
13030 +               }
13031 +               goto out;
13032 +       }
13033 +
13034 +       /* 
13035 +        * LSST_INIT_DONE - after the first start operation is completed
13036 +        * successfully and kcl_start_done() called.  If there are no errors, a
13037 +        * finish will arrive next and we'll move to LSST_CLEAR.
13038 +        */
13039 +
13040 +       if (cur_state == LSST_INIT_DONE) {
13041 +               switch (do_now) {
13042 +               case DO_STOP:
13043 +               case DO_FINISH_STOP:
13044 +                       next_state = LSST_WAIT_START;
13045 +                       break;
13046 +
13047 +               case DO_START:
13048 +               case DO_FINISH_START:
13049 +                       error = ls_reconfig(ls, gr);
13050 +                       if (error)
13051 +                               next_state = LSST_WAIT_START;
13052 +                       else
13053 +                               next_state = LSST_RECONFIG_DONE;
13054 +                       break;
13055 +
13056 +               case DO_FINISH:
13057 +                       next_state = LSST_CLEAR;
13058 +                       enable_locking(ls, finish_event);
13059 +                       log_all(ls, "recover event %u finished", finish_event);
13060 +                       break;
13061 +
13062 +               default:
13063 +                       GDLM_ASSERT(0,);
13064 +               }
13065 +               goto out;
13066 +       }
13067 +
13068 +      out:
13069 +       if (next_state)
13070 +               ls->ls_state = next_state;
13071 +
13072 +       if (gr) {
13073 +               kfree(gr->gr_nodeids);
13074 +               free_dlm_recover(gr);
13075 +       }
13076 +}
13077 +
13078 +static __inline__ gd_ls_t *get_work(int clear)
13079 +{
13080 +       gd_ls_t *ls;
13081 +
13082 +       spin_lock(&lslist_lock);
13083 +
13084 +       list_for_each_entry(ls, &lslist, ls_list) {
13085 +               if (clear) {
13086 +                       if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
13087 +                               goto got_work;
13088 +
13089 +               } else {
13090 +                       if (test_bit(LSFL_WORK, &ls->ls_flags))
13091 +                               goto got_work;
13092 +               }
13093 +       }
13094 +       ls = NULL;
13095 +
13096 + got_work:
13097 +       spin_unlock(&lslist_lock);
13098 +
13099 +       return ls;
13100 +}
13101 +
13102 +/* 
13103 + * Thread which does recovery for all lockspaces.
13104 + */
13105 +
13106 +static int dlm_recoverd(void *arg)
13107 +{
13108 +       gd_ls_t *ls;
13109 +
13110 +       daemonize("dlm_recoverd");
13111 +       recoverd_task = current;
13112 +       complete(&recoverd_run);
13113 +
13114 +       while (!test_bit(THREAD_STOP, &recoverd_flags)) {
13115 +               wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
13116 +               if ((ls = get_work(1)))
13117 +                       do_ls_recovery(ls);
13118 +       }
13119 +
13120 +       complete(&recoverd_run);
13121 +       return 0;
13122 +}
13123 +
13124 +/* 
13125 + * Mark a specific lockspace as needing work and wake up the thread to do it.
13126 + */
13127 +
13128 +void recoverd_kick(gd_ls_t *ls)
13129 +{
13130 +       set_bit(LSFL_WORK, &ls->ls_flags);
13131 +       wake_up(&recoverd_wait);
13132 +}
13133 +
13134 +/* 
13135 + * Start the recoverd thread when gdlm is started (before any lockspaces).
13136 + */
13137 +
13138 +int recoverd_start(void)
13139 +{
13140 +       int error;
13141 +
13142 +       clear_bit(THREAD_STOP, &recoverd_flags);
13143 +       error = kernel_thread(dlm_recoverd, NULL, 0);
13144 +       if (error < 0)
13145 +               goto out;
13146 +
13147 +       error = 0;
13148 +       wait_for_completion(&recoverd_run);
13149 +
13150 +      out:
13151 +       return error;
13152 +}
13153 +
13154 +/* 
13155 + * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone).
13156 + */
13157 +
13158 +int recoverd_stop(void)
13159 +{
13160 +       set_bit(THREAD_STOP, &recoverd_flags);
13161 +       wake_up(&recoverd_wait);
13162 +       wait_for_completion(&recoverd_run);
13163 +
13164 +       return 0;
13165 +}
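+
+/*
+ * Rough lifecycle sketch (the callers are outside this file, so this is
+ * an assumption based on the comments above): dlm_recoverd_init() once at
+ * module init, recoverd_start() before any lockspaces exist,
+ * recoverd_kick(ls) whenever a lockspace has work, and recoverd_stop()
+ * after all lockspaces are gone.
+ */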
13166 diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
13167 --- linux-orig/cluster/dlm/recoverd.h   1970-01-01 07:30:00.000000000 +0730
13168 +++ linux-patched/cluster/dlm/recoverd.h        2004-06-25 18:31:07.000000000 +0800
13169 @@ -0,0 +1,22 @@
13170 +/******************************************************************************
13171 +*******************************************************************************
13172 +**
13173 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
13174 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
13175 +**  
13176 +**  This copyrighted material is made available to anyone wishing to use,
13177 +**  modify, copy, or redistribute it subject to the terms and conditions
13178 +**  of the GNU General Public License v.2.
13179 +**
13180 +*******************************************************************************
13181 +******************************************************************************/
13182 +
13183 +#ifndef __RECOVERD_DOT_H__
13184 +#define __RECOVERD_DOT_H__
13185 +
13186 +void dlm_recoverd_init(void);
13187 +void recoverd_kick(gd_ls_t * ls);
13188 +int recoverd_start(void);
13189 +int recoverd_stop(void);
13190 +
13191 +#endif                         /* __RECOVERD_DOT_H__ */
13192 diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
13193 --- linux-orig/cluster/dlm/rsb.c        1970-01-01 07:30:00.000000000 +0730
13194 +++ linux-patched/cluster/dlm/rsb.c     2004-06-25 18:31:07.000000000 +0800
13195 @@ -0,0 +1,307 @@
13196 +/******************************************************************************
13197 +*******************************************************************************
13198 +**
13199 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
13200 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
13201 +**
13202 +**  This copyrighted material is made available to anyone wishing to use,
13203 +**  modify, copy, or redistribute it subject to the terms and conditions
13204 +**  of the GNU General Public License v.2.
13205 +**
13206 +*******************************************************************************
13207 +******************************************************************************/
13208 +
13209 +#include "dlm_internal.h"
13210 +#include "locking.h"
13211 +#include "memory.h"
13212 +#include "lockqueue.h"
13213 +#include "nodes.h"
13214 +#include "dir.h"
13215 +#include "util.h"
13216 +
13217 +static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent,
13218 +                                 char *name, int namelen)
13219 +{
13220 +       gd_res_t *r;
13221 +
13222 +       list_for_each_entry(r, head, res_hashchain) {
13223 +               if ((parent == r->res_parent) && (namelen == r->res_length) &&
13224 +                   (memcmp(name, r->res_name, namelen) == 0)) {
13225 +                       atomic_inc(&r->res_ref);
13226 +                       return r;
13227 +               }
13228 +       }
13229 +
13230 +       return NULL;
13231 +}
13232 +
13233 +/*
13234 + * A way to arbitrarily hold onto an rsb to which we already have a reference,
13235 + * to make sure it doesn't go away.  Opposite of release_rsb().
13236 + */
13237 +
13238 +void hold_rsb(gd_res_t *r)
13239 +{
13240 +       atomic_inc(&r->res_ref);
13241 +}
13242 +
13243 +/*
13244 + * release_rsb() - Decrement reference count on rsb struct.  Free the rsb
13245 + * struct when there are zero references.  Every lkb for the rsb adds a
13246 + * reference.  When ref is zero there can be no more lkb's for the rsb, on the
13247 + * reference.  When the ref count is zero there can be no more lkb's for the
13248 + * rsb, on the queues or anywhere else.
13249 +
13250 +void release_rsb(gd_res_t *r)
13251 +{
13252 +       gd_ls_t *ls = r->res_ls;
13253 +       int removed = FALSE;
13254 +
13255 +       write_lock(&ls->ls_reshash_lock);
13256 +       atomic_dec(&r->res_ref);
13257 +
13258 +       if (!atomic_read(&r->res_ref)) {
13259 +               GDLM_ASSERT(list_empty(&r->res_grantqueue),);
13260 +               GDLM_ASSERT(list_empty(&r->res_waitqueue),);
13261 +               GDLM_ASSERT(list_empty(&r->res_convertqueue),);
13262 +               removed = TRUE;
13263 +               list_del(&r->res_hashchain);
13264 +       }
13265 +       write_unlock(&ls->ls_reshash_lock);
13266 +
13267 +       if (removed) {
13268 +               down_read(&ls->ls_gap_rsblist);
13269 +               if (r->res_parent)
13270 +                       list_del(&r->res_subreslist);
13271 +               else
13272 +                       list_del(&r->res_rootlist);
13273 +               up_read(&ls->ls_gap_rsblist);
13274 +
13275 +               /*
13276 +                * Remove resdir entry if this was a locally mastered root rsb.
13277 +                */
13278 +               if (!r->res_parent && !r->res_nodeid) {
13279 +                       if (get_directory_nodeid(r) != our_nodeid())
13280 +                               remote_remove_resdata(r->res_ls,
13281 +                                                     get_directory_nodeid(r),
13282 +                                                     r->res_name,
13283 +                                                     r->res_length,
13284 +                                                     r->res_resdir_seq);
13285 +                       else
13286 +                               remove_resdata(r->res_ls, our_nodeid(),
13287 +                                              r->res_name, r->res_length,
13288 +                                              r->res_resdir_seq);
13289 +               }
13290 +
13291 +               if (r->res_lvbptr)
13292 +                       free_lvb(r->res_lvbptr);
13293 +
13294 +               free_rsb(r);
13295 +       }
13296 +}
13297 +
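+/*
+ * Typical usage sketch (illustrative only), mirroring restbl_lkb_purge()
+ * in recover.c: bracket any window where the rsb must stay allocated.
+ *
+ *     hold_rsb(rsb);
+ *     ... use rsb; it cannot be freed here ...
+ *     release_rsb(rsb);
+ */
+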
13298 +/*
13299 + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
13300 + * If the rsb exists, its ref count is incremented by this function.  If it
13301 + * doesn't exist, it's created with a ref count of one.
13302 + */
13303 +
13304 +int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen,
13305 +                      int create, gd_res_t **rp)
13306 +{
13307 +       uint32_t hash;
13308 +       gd_res_t *r, *tmp;
13309 +       int error = -ENOMEM;
13310 +
13311 +       GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
13312 +
13313 +       hash = gdlm_hash(name, namelen);
13314 +       hash &= ls->ls_hashmask;
13315 +
13316 +       read_lock(&ls->ls_reshash_lock);
13317 +       r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13318 +       read_unlock(&ls->ls_reshash_lock);
13319 +
13320 +       if (r)
13321 +               goto out_set;
13322 +       if (!create) {
13323 +               *rp = NULL;
13324 +               goto out;
13325 +       }
13326 +
13327 +       r = allocate_rsb(ls, namelen);
13328 +       if (!r)
13329 +               goto fail;
13330 +
13331 +       INIT_LIST_HEAD(&r->res_subreslist);
13332 +       INIT_LIST_HEAD(&r->res_grantqueue);
13333 +       INIT_LIST_HEAD(&r->res_convertqueue);
13334 +       INIT_LIST_HEAD(&r->res_waitqueue);
13335 +
13336 +       memcpy(r->res_name, name, namelen);
13337 +       r->res_length = namelen;
13338 +       r->res_ls = ls;
13339 +       init_rwsem(&r->res_lock);
13340 +       atomic_set(&r->res_ref, 1);
13341 +
13342 +       if (parent) {
13343 +               r->res_parent = parent;
13344 +               r->res_depth = parent->res_depth + 1;
13345 +               r->res_root = parent->res_root;
13346 +               r->res_nodeid = parent->res_nodeid;
13347 +       } else {
13348 +               r->res_parent = NULL;
13349 +               r->res_depth = 1;
13350 +               r->res_root = r;
13351 +               r->res_nodeid = -1;
13352 +       }
13353 +
13354 +       write_lock(&ls->ls_reshash_lock);
13355 +       tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13356 +       if (tmp) {
13357 +               write_unlock(&ls->ls_reshash_lock);
13358 +               free_rsb(r);
13359 +               r = tmp;
13360 +       } else {
13361 +               list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]);
13362 +               write_unlock(&ls->ls_reshash_lock);
13363 +
13364 +               down_read(&ls->ls_gap_rsblist);
13365 +               if (parent)
13366 +                       list_add_tail(&r->res_subreslist,
13367 +                                     &r->res_root->res_subreslist);
13368 +               else
13369 +                       list_add(&r->res_rootlist, &ls->ls_rootres);
13370 +               up_read(&ls->ls_gap_rsblist);
13371 +       }
13372 +
13373 +      out_set:
13374 +       *rp = r;
13375 +
13376 +      out:
13377 +       error = 0;
13378 +
13379 +      fail:
13380 +       return error;
13381 +}
13382 +
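+#if 0
+/* Illustrative usage sketch (not compiled; the resource name is
+   hypothetical): look up a root rsb without creating it. */
+static int example_rsb_lookup(gd_ls_t *ls)
+{
+       gd_res_t *r;
+       int error;
+
+       error = find_or_create_rsb(ls, NULL, "testres", 7, 0, &r);
+       if (error)
+               return error;
+       if (!r)
+               return -ENOENT; /* no such resource and create was 0 */
+
+       /* find_or_create_rsb() took a reference for us; drop it when done */
+       release_rsb(r);
+       return 0;
+}
+#endif
+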
13383 +/*
13384 + * Add an LKB to a resource's grant/convert/wait queue, in order.
13385 + */
13386 +
13387 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
13388 +{
13389 +       gd_lkb_t *lkb = NULL;
13390 +
13391 +       list_for_each_entry(lkb, head, lkb_statequeue) {
13392 +               if (lkb->lkb_rqmode < mode)
13393 +                       break;
13394 +       }
13395 +
13396 +       if (!lkb) {
13397 +               /* No entries in the queue, we are alone */
13398 +               list_add_tail(new, head);
13399 +       } else {
13400 +               __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
13401 +       }
13402 +}
13403 +
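+/*
+ * Example (assuming the usual DLM mode ordering NL < CR < CW < PR < PW <
+ * EX): inserting a PR request into a queue holding [EX, PW, CR] yields
+ * [EX, PW, PR, CR] -- the new entry goes in front of the first entry
+ * with a lower requested mode, so the queue stays sorted by decreasing
+ * rqmode.
+ */
+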
13404 +/*
13405 + * The rsb res_lock must be held in write when this function is called.
13406 + */
13407 +
13408 +void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13409 +{
13410 +
13411 +       GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status););
13412 +
13413 +       lkb->lkb_status = type;
13414 +
13415 +       switch (type) {
13416 +       case GDLM_LKSTS_WAITING:
13417 +               list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
13418 +               break;
13419 +
13420 +       case GDLM_LKSTS_GRANTED:
13421 +               lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
13422 +                               lkb->lkb_grmode);
13423 +               break;
13424 +
13425 +       case GDLM_LKSTS_CONVERT:
13426 +               if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
13427 +                       list_add(&lkb->lkb_statequeue,
13428 +                                &r->res_convertqueue);
13429 +               else if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
13430 +                       list_add_tail(&lkb->lkb_statequeue,
13431 +                                     &r->res_convertqueue);
13432 +               else
13433 +                       lkb_add_ordered(&lkb->lkb_statequeue,
13434 +                                       &r->res_convertqueue,
13435 +                                       lkb->lkb_rqmode);
13436 +               break;
13437 +
13438 +       default:
13439 +               GDLM_ASSERT(0,);
13440 +       }
13441 +}
13442 +
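+/*
+ * Convert queue placement, restating the GDLM_LKSTS_CONVERT case above:
+ * DLM_LKF_EXPEDITE requests jump to the head of the queue, DLM_LKF_QUECVT
+ * requests go to the tail, and all others are ordered by requested mode.
+ */
+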
13443 +void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13444 +{
13445 +       down_write(&r->res_lock);
13446 +       lkb_enqueue(r, lkb, type);
13447 +       up_write(&r->res_lock);
13448 +}
13449 +
13450 +/*
13451 + * The rsb res_lock must be held in write when this function is called.
13452 + */
13453 +
13454 +int lkb_dequeue(gd_lkb_t *lkb)
13455 +{
13456 +       int status = lkb->lkb_status;
13457 +
13458 +       if (!status)
13459 +               goto out;
13460 +
13461 +       lkb->lkb_status = 0;
13462 +       list_del(&lkb->lkb_statequeue);
13463 +
13464 +      out:
13465 +       return status;
13466 +}
13467 +
13468 +int res_lkb_dequeue(gd_lkb_t *lkb)
13469 +{
13470 +       int status;
13471 +
13472 +       down_write(&lkb->lkb_resource->res_lock);
13473 +       status = lkb_dequeue(lkb);
13474 +       up_write(&lkb->lkb_resource->res_lock);
13475 +
13476 +       return status;
13477 +}
13478 +
13479 +/*
13480 + * The rsb res_lock must be held in write when this function is called.
13481 + */
13482 +
13483 +int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13484 +{
13485 +       int status;
13486 +
13487 +       status = lkb_dequeue(lkb);
13488 +       lkb_enqueue(r, lkb, type);
13489 +
13490 +       return status;
13491 +}
13492 +
13493 +int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13494 +{
13495 +       int status;
13496 +
13497 +       down_write(&r->res_lock);
13498 +       status = lkb_swqueue(r, lkb, type);
13499 +       up_write(&r->res_lock);
13500 +
13501 +       return status;
13502 +}
13503 diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
13504 --- linux-orig/cluster/dlm/rsb.h        1970-01-01 07:30:00.000000000 +0730
13505 +++ linux-patched/cluster/dlm/rsb.h     2004-06-25 18:31:07.000000000 +0800
13506 @@ -0,0 +1,30 @@
13507 +/******************************************************************************
13508 +*******************************************************************************
13509 +**
13510 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
13511 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
13512 +**  
13513 +**  This copyrighted material is made available to anyone wishing to use,
13514 +**  modify, copy, or redistribute it subject to the terms and conditions
13515 +**  of the GNU General Public License v.2.
13516 +**
13517 +*******************************************************************************
13518 +******************************************************************************/
13519 +
13520 +#ifndef __RSB_DOT_H__
13521 +#define __RSB_DOT_H__
13522 +
13523 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
13524 +void _release_rsb(gd_res_t * r);
13525 +void release_rsb(gd_res_t * r);
13526 +void hold_rsb(gd_res_t * r);
13527 +int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen,
13528 +                      int create, gd_res_t ** rp);
13529 +void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13530 +void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13531 +int lkb_dequeue(gd_lkb_t * lkb);
13532 +int res_lkb_dequeue(gd_lkb_t * lkb);
13533 +int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13534 +int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13535 +
13536 +#endif                         /* __RSB_DOT_H__ */
13537 diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
13538 --- linux-orig/cluster/dlm/util.c       1970-01-01 07:30:00.000000000 +0730
13539 +++ linux-patched/cluster/dlm/util.c    2004-06-25 18:31:07.000000000 +0800
13540 @@ -0,0 +1,130 @@
13541 +/******************************************************************************
13542 +*******************************************************************************
13543 +**
13544 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
13545 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
13546 +**  
13547 +**  This copyrighted material is made available to anyone wishing to use,
13548 +**  modify, copy, or redistribute it subject to the terms and conditions
13549 +**  of the GNU General Public License v.2.
13550 +**
13551 +*******************************************************************************
13552 +******************************************************************************/
13553 +
13554 +#include "dlm_internal.h"
13555 +
13556 +static const uint32_t crc_32_tab[] = {
13557 +       0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
13558 +       0xe963a535, 0x9e6495a3,
13559 +       0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
13560 +       0xe7b82d07, 0x90bf1d91,
13561 +       0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
13562 +       0xf4d4b551, 0x83d385c7,
13563 +       0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
13564 +       0xfa0f3d63, 0x8d080df5,
13565 +       0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
13566 +       0xd20d85fd, 0xa50ab56b,
13567 +       0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
13568 +       0xdcd60dcf, 0xabd13d59,
13569 +       0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
13570 +       0xcfba9599, 0xb8bda50f,
13571 +       0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
13572 +       0xc1611dab, 0xb6662d3d,
13573 +       0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
13574 +       0x9fbfe4a5, 0xe8b8d433,
13575 +       0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
13576 +       0x91646c97, 0xe6635c01,
13577 +       0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
13578 +       0x8208f4c1, 0xf50fc457,
13579 +       0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
13580 +       0x8cd37cf3, 0xfbd44c65,
13581 +       0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
13582 +       0xa4d1c46d, 0xd3d6f4fb,
13583 +       0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
13584 +       0xaa0a4c5f, 0xdd0d7cc9,
13585 +       0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
13586 +       0xb966d409, 0xce61e49f,
13587 +       0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
13588 +       0xb7bd5c3b, 0xc0ba6cad,
13589 +       0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
13590 +       0x04db2615, 0x73dc1683,
13591 +       0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
13592 +       0x0a00ae27, 0x7d079eb1,
13593 +       0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
13594 +       0x196c3671, 0x6e6b06e7,
13595 +       0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
13596 +       0x17b7be43, 0x60b08ed5,
13597 +       0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
13598 +       0x3fb506dd, 0x48b2364b,
13599 +       0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
13600 +       0x316e8eef, 0x4669be79,
13601 +       0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
13602 +       0x220216b9, 0x5505262f,
13603 +       0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
13604 +       0x2cd99e8b, 0x5bdeae1d,
13605 +       0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
13606 +       0x72076785, 0x05005713,
13607 +       0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
13608 +       0x7cdcefb7, 0x0bdbdf21,
13609 +       0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
13610 +       0x6fb077e1, 0x18b74777,
13611 +       0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
13612 +       0x616bffd3, 0x166ccf45,
13613 +       0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
13614 +       0x4969474d, 0x3e6e77db,
13615 +       0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
13616 +       0x47b2cf7f, 0x30b5ffe9,
13617 +       0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
13618 +       0x54de5729, 0x23d967bf,
13619 +       0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
13620 +       0x5a05df1b, 0x2d02ef8d
13621 +};
13622 +
13623 +/**
13624 + * gdlm_hash - hash an array of data
13625 + * @data: the data to be hashed
13626 + * @len: the length of data to be hashed
13627 + *
13628 + * Copied from GFS.
13629 + *
13630 + * Take some data and convert it to a 32-bit hash.
13631 + *
13632 + * The hash function is a 32-bit CRC of the data.  The algorithm uses
13633 + * the crc_32_tab table above.
13634 + *
13635 + * This may not be the fastest hash function, but it does a fair bit better
13636 + * at providing uniform results than the others I've looked at.  That's
13637 + * really important for efficient directories.
13638 + *
13639 + * Returns: the hash
13640 + */
13641 +
13642 +uint32_t gdlm_hash(const char *data, int len)
13643 +{
13644 +       uint32_t hash = 0xFFFFFFFF;
13645 +
13646 +       for (; len--; data++)
13647 +               hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
13648 +
13649 +       hash = ~hash;
13650 +
13651 +       return hash;
13652 +}
13653 +
13654 +uint32_t gdlm_next_power2(uint32_t val)
13655 +{
13656 +       uint32_t x;
13657 +
13658 +       for (x = 1; x && x < val; x <<= 1) ;   /* x == 0: overflow guard */
13659 +
13660 +       return x;
13661 +}
13662 +
13663 +void print_lkb(gd_lkb_t *lkb)
13664 +{
13665 +       printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d "
13666 +               "nodeid=%u lqstate=%x lqflags=%x\n",
13667 +               lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status,
13668 +               lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid,
13669 +               lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags);
13670 +}
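# Editorial note: a minimal usage sketch of the two helpers above (hypothetical
# caller code, not part of the patch).  gdlm_hash maps a resource name to a
# 32-bit CRC, and gdlm_next_power2 rounds a table size up so the hash can be
# reduced to a bucket index with a mask instead of a modulo:
#
#	/* Hypothetical: assumes only the declarations from util.h */
#	static unsigned int example_bucket(const char *name, int namelen,
#					   uint32_t requested_size)
#	{
#		/* e.g. 100 -> 128, so (size - 1) is a valid bit mask */
#		uint32_t size = gdlm_next_power2(requested_size);
#
#		return gdlm_hash(name, namelen) & (size - 1);
#	}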
13671 diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
13672 --- linux-orig/cluster/dlm/util.h       1970-01-01 07:30:00.000000000 +0730
13673 +++ linux-patched/cluster/dlm/util.h    2004-06-25 18:31:07.000000000 +0800
13674 @@ -0,0 +1,22 @@
13675 +/******************************************************************************
13676 +*******************************************************************************
13677 +**
13678 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
13679 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
13680 +**  
13681 +**  This copyrighted material is made available to anyone wishing to use,
13682 +**  modify, copy, or redistribute it subject to the terms and conditions
13683 +**  of the GNU General Public License v.2.
13684 +**
13685 +*******************************************************************************
13686 +******************************************************************************/
13687 +
13688 +#ifndef __UTIL_DOT_H__
13689 +#define __UTIL_DOT_H__
13690 +
13691 +uint32_t gdlm_hash(const char *data, int len);
13692 +uint32_t gdlm_next_power2(uint32_t val);
13693 +
13694 +void print_lkb(gd_lkb_t *lkb);
13695 +
13696 +#endif
13697 diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
13698 --- linux-orig/include/cluster/dlm.h    1970-01-01 07:30:00.000000000 +0730
13699 +++ linux-patched/include/cluster/dlm.h 2004-06-25 18:31:07.000000000 +0800
13700 @@ -0,0 +1,404 @@
13701 +/******************************************************************************
13702 +*******************************************************************************
13703 +**
13704 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
13705 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
13706 +**
13707 +**  This copyrighted material is made available to anyone wishing to use,
13708 +**  modify, copy, or redistribute it subject to the terms and conditions
13709 +**  of the GNU General Public License v.2.
13710 +**
13711 +*******************************************************************************
13712 +******************************************************************************/
13713 +
13714 +#ifndef __DLM_DOT_H__
13715 +#define __DLM_DOT_H__
13716 +
13717 +/*
13718 + * Interface to DLM - routines and structures to use DLM lockspaces.
13719 + */
13720 +
13721 +/*
13722 + * Lock Modes
13723 + */
13724 +
13725 +#define DLM_LOCK_IV            (-1)    /* invalid */
13726 +#define DLM_LOCK_NL            (0)     /* null */
13727 +#define DLM_LOCK_CR            (1)     /* concurrent read */
13728 +#define DLM_LOCK_CW            (2)     /* concurrent write */
13729 +#define DLM_LOCK_PR            (3)     /* protected read */
13730 +#define DLM_LOCK_PW            (4)     /* protected write */
13731 +#define DLM_LOCK_EX            (5)     /* exclusive */
13732 +
13733 +/*
13734 + * Maximum size in bytes of a dlm_lock name
13735 + */
13736 +
13737 +#define DLM_RESNAME_MAXLEN     (64)
13738 +
13739 +/*
13740 + * Size in bytes of Lock Value Block
13741 + */
13742 +
13743 +#define DLM_LVB_LEN            (32)
13744 +
13745 +/*
13746 + * Flags to dlm_new_lockspace
13747 + *
13748 + * DLM_LSF_NOTIMERS
13749 + *
13750 + * Do not subject locks in this lockspace to time-outs.
13751 + *
13752 + */
13753 +
13754 +#define DLM_LSF_NOTIMERS       (1)
13755 +
13756 +/*
13757 + * Flags to dlm_lock
13758 + *
13759 + * DLM_LKF_NOQUEUE
13760 + *
13761 + * Do not queue the lock request on the wait queue if it cannot be granted
13762 + * immediately.  If the lock cannot be granted because of this flag, DLM will
13763 + * either return -EAGAIN from the dlm_lock call or will return 0 from
13764 + * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
13765 + *
13766 + * DLM_LKF_CONVERT
13767 + *
13768 + * Indicates a lock conversion request.  For conversions the name and namelen
13769 + * are ignored and the lock ID in the LKSB is used to identify the lock.
13770 + *
13771 + * DLM_LKF_VALBLK
13772 + *
13773 + * Requests DLM to return the current contents of the lock value block in the
13774 + * lock status block.  When this flag is set in a lock conversion from PW or EX
13775 + * modes, DLM assigns the value specified in the lock status block to the lock
13776 + * value block of the lock resource.  The LVB is a DLM_LVB_LEN size array
13777 + * containing application-specific information.
13778 + *
13779 + * DLM_LKF_QUECVT
13780 + *
13781 + * Force a conversion lock request to the back of the convert queue.  All other
13782 + * conversion requests ahead of it must be granted before it can be granted.
13783 + * This enforces a FIFO ordering on the convert queue.  When this flag is set,
13784 + * indefinite postponement is averted.  This flag is allowed only when
13785 + * converting a lock to a more restrictive mode.
13786 + *
13787 + * DLM_LKF_CANCEL
13788 + *
13789 + * Used to cancel a pending conversion (with dlm_unlock).  Lock is returned to
13790 + * previously granted mode.
13791 + *
13792 + * DLM_LKF_IVVALBLK
13793 + *
13794 + * Invalidate/clear the lock value block.
13795 + *
13796 + * DLM_LKF_CONVDEADLK
13797 + *
13798 + * The granted mode of a lock being converted (from a non-NL mode) can be
13799 + * changed to NL in the process of acquiring the requested mode to avoid
13800 + * conversion deadlock.
13801 + *
13802 + * DLM_LKF_PERSISTENT
13803 + *
13804 + * Only relevant to locks originating in userspace. Signals to the ioctl.c code
13805 + * that this lock should not be unlocked when the process exits.
13806 + *
13807 + * DLM_LKF_NODLCKWT
13808 + *
13809 + * This lock is not to be checked for conversion deadlocks.
13810 + *
13811 + * DLM_LKF_NODLCKBLK
13812 + *
13813 + * not yet implemented
13814 + *
13815 + * DLM_LKF_EXPEDITE
13816 + *
13817 + * If this lock conversion cannot be granted immediately it is to go to the
13818 + * head of the conversion queue regardless of its requested lock mode.
13819 + *
13820 + * DLM_LKF_NOQUEUEBAST
13821 + *
13822 + * Send blocking ASTs before returning -EAGAIN to the caller.  This flag is
13823 + * only used along with the NOQUEUE flag.  Blocking ASTs are not otherwise
13824 + * sent for failed NOQUEUE requests.
13825 + *
13826 + */
13827 +
13828 +#define DLM_LKF_NOQUEUE        (0x00000001)
13829 +#define DLM_LKF_CANCEL         (0x00000002)
13830 +#define DLM_LKF_CONVERT        (0x00000004)
13831 +#define DLM_LKF_VALBLK         (0x00000008)
13832 +#define DLM_LKF_QUECVT         (0x00000010)
13833 +#define DLM_LKF_IVVALBLK       (0x00000020)
13834 +#define DLM_LKF_CONVDEADLK     (0x00000040)
13835 +#define DLM_LKF_PERSISTENT     (0x00000080)
13836 +#define DLM_LKF_NODLCKWT       (0x00000100)
13837 +#define DLM_LKF_NODLCKBLK      (0x00000200)
13838 +#define DLM_LKF_EXPEDITE       (0x00000400)
13839 +#define DLM_LKF_NOQUEUEBAST    (0x00000800)
13840 +
13841 +/*
13842 + * Some return codes that are not in errno.h
13843 + */
13844 +
13845 +#define DLM_ECANCEL            (0x10001)
13846 +#define DLM_EUNLOCK            (0x10002)
13847 +
13848 +typedef void dlm_lockspace_t;
13849 +
13850 +/*
13851 + * Lock range structure
13852 + */
13853 +
13854 +struct dlm_range {
13855 +       uint64_t ra_start;
13856 +       uint64_t ra_end;
13857 +};
13858 +
13859 +/*
13860 + * Lock status block
13861 + *
13862 + * Use this structure to specify the contents of the lock value block.  For a
13863 + * conversion request, this structure is used to specify the lock ID of the
13864 + * lock.  DLM writes the status of the lock request and the lock ID assigned
13865 + * to the request in the lock status block.
13866 + *
13867 + * sb_lkid: the returned lock ID.  It is set on new (non-conversion) requests.
13868 + * It is available when dlm_lock returns.
13869 + *
13870 + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
13871 + * shown for the DLM_LKF_VALBLK flag.
13872 + *
13873 + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
13874 + * it was first demoted to NL to avoid conversion deadlock.
13875 + *
13876 + * sb_status: the returned status of the lock request set prior to AST
13877 + * execution.  Possible return values:
13878 + *
13879 + * 0 if lock request was successful
13880 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
13881 + * -ENOMEM if there is no memory to process request
13882 + * -EINVAL if there are invalid parameters
13883 + * -DLM_EUNLOCK if unlock request was successful
13884 + * -DLM_ECANCEL if a pending request was cancelled (see DLM_LKF_CANCEL)
13885 + */
13886 +
13887 +#define DLM_SBF_DEMOTED        (0x01)
13888 +
13889 +struct dlm_lksb {
13890 +       int      sb_status;
13891 +       uint32_t sb_lkid;
13892 +       char     sb_flags;
13893 +       char *   sb_lvbptr;
13894 +};
13895 +
13896 +/*
13897 + * These defines are the bits that make up the
13898 + * query code.
13899 + */
13900 +
13901 +/* Bits 0, 1, 2: the lock mode or DLM_LOCK_THIS; see DLM_LOCK_NL etc. in
13902 + * dlm.h.  Ignored for DLM_QUERY_LOCKS_ALL */
13903 +#define DLM_LOCK_THIS            0x0007
13904 +#define DLM_QUERY_MODE_MASK      0x0007
13905 +
13906 +/* Bits 3, 4, 5  bitmap of queue(s) to query */
13907 +#define DLM_QUERY_QUEUE_WAIT     0x0008
13908 +#define DLM_QUERY_QUEUE_CONVERT  0x0010
13909 +#define DLM_QUERY_QUEUE_GRANT    0x0020
13910 +#define DLM_QUERY_QUEUE_GRANTED  0x0030        /* Shorthand */
13911 +#define DLM_QUERY_QUEUE_ALL      0x0038        /* Shorthand */
13912 +
13913 +/* Bit 6, Return only the information that can be established without a network
13914 + * round-trip. The caller must be aware of the implications of this. Useful for
13915 + * just getting the master node id or resource name. */
13916 +#define DLM_QUERY_LOCAL          0x0040
13917 +
13918 +/* Bits 8 up, query type */
13919 +#define DLM_QUERY_LOCKS_HIGHER   0x0100
13920 +#define DLM_QUERY_LOCKS_LOWER    0x0200
13921 +#define DLM_QUERY_LOCKS_EQUAL    0x0300
13922 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400
13923 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
13924 +#define DLM_QUERY_LOCKS_ALL      0x0600
13925 +#define DLM_QUERY_MASK           0x0F00
13926 +
13927 +/* GRMODE is the default for mode comparisons,
13928 +   RQMODE might also be handy */
13929 +#define DLM_QUERY_GRMODE         0x0000
13930 +#define DLM_QUERY_RQMODE         0x1000
13931 +
13932 +/* Structures passed into and out of the query */
13933 +
13934 +struct dlm_lockinfo {
13935 +       int lki_lkid;           /* Lock ID on originating node */
13936 +        int lki_mstlkid;        /* Lock ID on master node */
13937 +       int lki_parent;
13938 +       int lki_node;           /* Originating node (not master) */
13939 +       uint8_t lki_state;      /* Queue the lock is on */
13940 +       uint8_t lki_grmode;     /* Granted mode */
13941 +       uint8_t lki_rqmode;     /* Requested mode */
13942 +       struct dlm_range lki_grrange;   /* Granted range, if applicable */
13943 +       struct dlm_range lki_rqrange;   /* Requested range, if applicable */
13944 +};
13945 +
13946 +struct dlm_resinfo {
13947 +       int rsi_length;
13948 +       int rsi_grantcount;     /* No. of nodes on grant queue */
13949 +       int rsi_convcount;      /* No. of nodes on convert queue */
13950 +       int rsi_waitcount;      /* No. of nodes on wait queue */
13951 +       int rsi_masternode;     /* Master for this resource */
13952 +       char rsi_name[DLM_RESNAME_MAXLEN];      /* Resource name */
13953 +       char rsi_valblk[DLM_LVB_LEN];   /* Master's LVB contents, if applicable
13954 +                                        */
13955 +};
13956 +
13957 +struct dlm_queryinfo {
13958 +       struct dlm_resinfo *gqi_resinfo;
13959 +       struct dlm_lockinfo *gqi_lockinfo;      /* This points to an array
13960 +                                                * of structs */
13961 +       int gqi_locksize;       /* input */
13962 +       int gqi_lockcount;      /* output */
13963 +};
13964 +
13965 +#ifdef __KERNEL__
13966 +/*
13967 + * dlm_init
13968 + *
13969 + * Starts and initializes DLM threads and structures.  Creation of the first
13970 + * lockspace will call this if it has not been called already.
13971 + *
13972 + * Returns: 0 if successful, -EXXX on error
13973 + */
13974 +
13975 +int dlm_init(void);
13976 +
13977 +/*
13978 + * dlm_release
13979 + *
13980 + * Stops DLM threads.
13981 + *
13982 + * Returns: 0 if successful, -EXXX on error
13983 + */
13984 +
13985 +int dlm_release(void);
13986 +
13987 +/*
13988 + * dlm_new_lockspace
13989 + *
13990 + * Starts a lockspace with the given name.  If the named lockspace exists in
13991 + * the cluster, the calling node joins it.
13992 + */
13993 +
13994 +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
13995 +                     int flags);
13996 +
13997 +/*
13998 + * dlm_release_lockspace
13999 + *
14000 + * Stop a lockspace.
14001 + */
14002 +
14003 +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14004 +
14005 +/*
14006 + * dlm_lock
14007 + *
14008 + * Make an asynchronous request to acquire or convert a lock on a named
14009 + * resource.
14010 + *
14011 + * lockspace: context for the request
14012 + * mode: the requested mode of the lock (DLM_LOCK_)
14013 + * lksb: lock status block for input and async return values
14014 + * flags: input flags (DLM_LKF_)
14015 + * name: name of the resource to lock, can be binary
14016 + * namelen: the length in bytes of the resource name (at most DLM_RESNAME_MAXLEN)
14017 + * parent: the lock ID of a parent lock or 0 if none
14018 + * lockast: function DLM executes when it completes processing the request
14019 + * astarg: argument passed to lockast and bast functions
14020 + * bast: function DLM executes when this lock later blocks another request
14021 + *
14022 + * Returns:
14023 + * 0 if request is successfully queued for processing
14024 + * -EINVAL if any input parameters are invalid
14025 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14026 + * -ENOMEM if there is no memory to process request
14027 + * -ENOTCONN if there is a communication error
14028 + *
14029 + * If the call to dlm_lock returns an error then the operation has failed and
14030 + * the AST routine will not be called.  If dlm_lock returns 0 it is still
14031 + * possible that the lock operation will fail. The AST routine will be called
14032 + * when the locking is complete and the status is returned in the lksb.
14033 + *
14034 + * If AST routines or an astarg are passed to a conversion operation, they
14035 + * overwrite the values that were passed to a previous dlm_lock call for the
14036 + * same lock.
14037 + *
14038 + * AST routines should not block (at least not for long), but may make
14039 + * any locking calls they please.
14040 + */
14041 +
14042 +int dlm_lock(dlm_lockspace_t *lockspace,
14043 +            uint32_t mode,
14044 +            struct dlm_lksb *lksb,
14045 +            uint32_t flags,
14046 +            void *name,
14047 +            unsigned int namelen,
14048 +            uint32_t parent,
14049 +            void (*lockast) (void *astarg),
14050 +            void *astarg,
14051 +            void (*bast) (void *astarg, int mode),
14052 +            struct dlm_range *range);
14053 +
14054 +/*
14055 + * dlm_unlock
14056 + *
14057 + * Asynchronously release a lock on a resource.  The AST routine is called
14058 + * when the resource is successfully unlocked.
14059 + *
14060 + * lockspace: context for the request
14061 + * lkid: the lock ID as returned in the lksb
14062 + * flags: input flags (DLM_LKF_)
14063 + * lksb: if NULL the lksb parameter passed to last lock request is used
14064 + * astarg: if NULL, astarg in last lock request is used
14065 + *
14066 + * Returns:
14067 + * 0 if request is successfully queued for processing
14068 + * -EINVAL if any input parameters are invalid
14069 + * -ENOTEMPTY if the lock still has sublocks
14070 + * -EBUSY if the lock is waiting for a remote lock operation
14071 + * -ENOTCONN if there is a communication error
14072 + */
14073 +
14074 +extern int dlm_unlock(dlm_lockspace_t *lockspace,
14075 +                      uint32_t lkid,
14076 +                      uint32_t flags,
14077 +                      struct dlm_lksb *lksb,
14078 +                      void *astarg);
14079 +
14080 +/* Query interface
14081 + *
14082 + * Query the other holders of a resource, given a known lock ID
14083 + *
14084 + * lockspace:   context for the request
14085 + * lksb:        LKSB, sb_lkid contains the lock ID of a valid lock
14086 + *              on the resource. sb_status will contain the status
14087 + *             of the request on completion.
14088 + * query:       query bitmap see DLM_QUERY_* above
14089 + * qinfo:       pointer to dlm_queryinfo structure
14090 + * ast_routine: AST routine to call on completion
14091 + * astarg:      argument to AST routine. It is "traditional"
14092 + *              to put the qinfo pointer into lksb->sb_lvbptr
14093 + *              and pass the lksb in here.
14094 + */
14095 +extern int dlm_query(dlm_lockspace_t *lockspace,
14096 +                     struct dlm_lksb *lksb,
14097 +                     int query,
14098 +                     struct dlm_queryinfo *qinfo,
14099 +                     void (ast_routine(void *)),
14100 +                     void *astarg);
14101 +
14102 +#endif                         /* __KERNEL__ */
14103 +
14104 +#endif                         /* __DLM_DOT_H__ */
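# Editorial note: the comments in dlm.h above describe an asynchronous API.  A
# hedged sketch of the usual call sequence (hypothetical module code, assuming
# only this header): create a lockspace, request a lock with a completion AST,
# read the real status from the lksb inside the AST, then unlock by lock ID.
#
#	#include <linux/kernel.h>
#	#include <cluster/dlm.h>
#
#	static struct dlm_lksb my_lksb;
#
#	static void my_ast(void *astarg)
#	{
#		/* The final status lands here, not in dlm_lock's return:
#		 * 0 on grant, -EAGAIN for a failed NOQUEUE request, etc. */
#		printk("lock %x status %d\n", my_lksb.sb_lkid,
#		       my_lksb.sb_status);
#	}
#
#	static int example(void)
#	{
#		dlm_lockspace_t *ls;
#		int error;
#
#		error = dlm_new_lockspace("example", 7, &ls, 0);
#		if (error)
#			return error;
#
#		/* 0 from dlm_lock only means "request queued" */
#		error = dlm_lock(ls, DLM_LOCK_EX, &my_lksb, DLM_LKF_NOQUEUE,
#				 "res1", 4, 0, my_ast, NULL, NULL, NULL);
#		if (error)
#			return error;
#
#		/* ... later: release using the ID returned in the lksb */
#		return dlm_unlock(ls, my_lksb.sb_lkid, 0, NULL, NULL);
#	}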
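# Editorial note: a query code is composed from the bitmap pieces defined
# above (one mode, a queue set, a comparison type).  The sketch below is a
# hypothetical illustration of that composition and of the "traditional"
# astarg convention from the dlm_query comment; sb_lkid is assumed to already
# hold a valid lock ID on the resource:
#
#	#include <linux/kernel.h>
#	#include <cluster/dlm.h>
#
#	static struct dlm_lksb q_lksb;
#	static struct dlm_queryinfo qinfo;	/* a real caller also sets up
#						   gqi_lockinfo/gqi_locksize */
#
#	static void query_ast(void *astarg)
#	{
#		struct dlm_lksb *lksb = astarg;
#		struct dlm_queryinfo *qi =
#			(struct dlm_queryinfo *) lksb->sb_lvbptr;
#
#		printk("query status %d, %d locks returned\n",
#		       lksb->sb_status, qi->gqi_lockcount);
#	}
#
#	static int run_query(dlm_lockspace_t *ls)
#	{
#		/* EX-mode comparison, grant queue, blocking locks only */
#		int query = DLM_LOCK_EX | DLM_QUERY_QUEUE_GRANT |
#			    DLM_QUERY_LOCKS_BLOCKING;
#
#		q_lksb.sb_lvbptr = (char *) &qinfo;
#		return dlm_query(ls, &q_lksb, query, &qinfo, query_ast,
#				 &q_lksb);
#	}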
14105 diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
14106 --- linux-orig/include/cluster/dlm_device.h     1970-01-01 07:30:00.000000000 +0730
14107 +++ linux-patched/include/cluster/dlm_device.h  2004-06-25 18:31:07.000000000 +0800
14108 @@ -0,0 +1,63 @@
14109 +/******************************************************************************
14110 +*******************************************************************************
14111 +**
14112 +**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
14113 +**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
14114 +**
14115 +**  This copyrighted material is made available to anyone wishing to use,
14116 +**  modify, copy, or redistribute it subject to the terms and conditions
14117 +**  of the GNU General Public License v.2.
14118 +**
14119 +*******************************************************************************
14120 +******************************************************************************/
14121 +
14122 +/* This is the device interface for dlm; most users will use a library
14123 + * interface.
14124 + */
14125 +
14126 +/* Version of the device interface */
14127 +#define DLM_DEVICE_VERSION_MAJOR 2
14128 +#define DLM_DEVICE_VERSION_MINOR 0
14129 +#define DLM_DEVICE_VERSION_PATCH 0
14130 +
14131 +/* struct passed to the lock write */
14132 +struct dlm_lock_params {
14133 +       uint32_t version[3];
14134 +       uint8_t cmd;
14135 +       uint8_t mode;
14136 +       uint16_t flags;
14137 +       uint32_t lkid;
14138 +       uint32_t parent;
14139 +       struct dlm_range range;
14140 +       uint8_t namelen;
14141 +        void *astparam;
14142 +        void *astaddr;
14143 +        void *bastaddr;
14144 +        struct dlm_lksb *lksb;
14145 +       char name[1];
14146 +};
14147 +
14148 +
14149 +/* struct read from the "device" fd,
14150 +   consists mainly of userspace pointers for the library to use */
14151 +struct dlm_lock_result {
14152 +       uint8_t cmd;
14153 +        void *astparam;
14154 +        void (*astaddr)(void *astparam);
14155 +        struct dlm_lksb *user_lksb;
14156 +        struct dlm_lksb lksb;  /* But this has real data in it */
14157 +        uint8_t bast_mode; /* Not yet used */
14158 +};
14159 +
14160 +/* commands passed to the device */
14161 +#define DLM_USER_LOCK       1
14162 +#define DLM_USER_UNLOCK     2
14163 +#define DLM_USER_QUERY      3
14164 +
14165 +/* Arbitrary length restriction */
14166 +#define MAX_LS_NAME_LEN 64
14167 +
14168 +/* ioctls on the device */
14169 +#define DLM_CREATE_LOCKSPACE         _IOW('D', 0x01, char *)
14170 +#define DLM_RELEASE_LOCKSPACE        _IOW('D', 0x02, char *)
14171 +#define DLM_FORCE_RELEASE_LOCKSPACE  _IOW('D', 0x03, char *)
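# Editorial note: putting the device interface together.  A rough userspace
# sketch of the protocol implied above -- the device path and the exact
# read/write framing are assumptions here; real applications would use the
# libdlm library rather than this raw interface:
#
#	#include <fcntl.h>
#	#include <stdint.h>
#	#include <string.h>
#	#include <sys/ioctl.h>
#	#include <unistd.h>
#	#include <cluster/dlm.h>
#	#include <cluster/dlm_device.h>
#
#	int example(void)
#	{
#		char buf[sizeof(struct dlm_lock_params) + 4];
#		struct dlm_lock_params *p = (struct dlm_lock_params *) buf;
#		struct dlm_lock_result result;
#		int fd = open("/dev/dlm", O_RDWR);	/* hypothetical node */
#
#		if (fd < 0)
#			return -1;
#
#		/* lockspace ioctls take the lockspace name as argument */
#		if (ioctl(fd, DLM_CREATE_LOCKSPACE, "example") < 0)
#			return -1;
#
#		memset(buf, 0, sizeof(buf));
#		p->version[0] = DLM_DEVICE_VERSION_MAJOR;
#		p->version[1] = DLM_DEVICE_VERSION_MINOR;
#		p->version[2] = DLM_DEVICE_VERSION_PATCH;
#		p->cmd = DLM_USER_LOCK;
#		p->mode = DLM_LOCK_PR;
#		p->namelen = 4;
#		memcpy(p->name, "res1", 4);	/* name[] trails the struct */
#
#		write(fd, buf, sizeof(buf));	   /* submit the request */
#		read(fd, &result, sizeof(result)); /* wait for completion */
#
#		return result.lksb.sb_status;
#	}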