]>
Commit | Line | Data |
---|---|---|
4bf12011 | 1 | # Add DLM to the build system |
2 | diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig | |
3 | --- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800 | |
4 | +++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800 | |
5 | @@ -10,4 +10,22 @@ config CLUSTER | |
6 | needed by all the other components. It provides membership services | |
7 | for those other subsystems. | |
8 | ||
9 | +config CLUSTER_DLM | |
10 | + tristate "Distributed Lock Manager" | |
11 | + depends on CLUSTER | |
12 | + ---help--- | |
13 | + A fully distributed lock manager, providing cluster-wide locking services | |
14 | + and protected lock namespaces for kernel and userland applications. | |
15 | + | |
16 | +config CLUSTER_DLM_PROCLOCKS | |
17 | + boolean "/proc/locks support for DLM" | |
18 | + depends on CLUSTER_DLM | |
19 | + depends on PROC_FS | |
20 | + ---help--- | |
21 | + If this option is enabled, a file will appear at /proc/cluster/dlm_locks. | |
22 | + Write into this "file" the name of a lockspace known to the DLM and then | |
23 | + read out a list of all the resources and locks in that lockspace that are | |
24 | + known to the local node. Note that because the DLM is distributed this may | |
25 | + not be the full lock picture. | |
26 | + | |
27 | endmenu | |
28 | diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile | |
29 | --- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800 | |
30 | +++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800 | |
31 | @@ -1,3 +1,4 @@ | |
32 | obj-y := nocluster.o | |
33 | ||
34 | obj-$(CONFIG_CLUSTER) += cman/ | |
35 | +obj-$(CONFIG_CLUSTER_DLM) += dlm/ | |
36 | diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile | |
37 | --- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730 | |
38 | +++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800 | |
39 | @@ -0,0 +1,23 @@ | |
40 | +dlm-objs := ast.o \ | |
41 | + config.o \ | |
42 | + device.o \ | |
43 | + dir.o \ | |
44 | + lkb.o \ | |
45 | + locking.o \ | |
46 | + lockqueue.o \ | |
47 | + lockspace.o \ | |
48 | + lowcomms.o \ | |
49 | + main.o \ | |
50 | + memory.o \ | |
51 | + midcomms.o \ | |
52 | + nodes.o \ | |
53 | + proc.o \ | |
54 | + queries.o \ | |
55 | + rebuild.o \ | |
56 | + reccomms.o \ | |
57 | + recover.o \ | |
58 | + recoverd.o \ | |
59 | + rsb.o \ | |
60 | + util.o \ | |
61 | + | |
62 | +obj-$(CONFIG_CLUSTER_DLM) += dlm.o | |
63 | diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c | |
64 | --- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
65 | +++ linux-patched/cluster/dlm/ast.c 2004-06-29 20:01:19.000000000 +0800 |
66 | @@ -0,0 +1,560 @@ | |
4bf12011 | 67 | +/****************************************************************************** |
68 | +******************************************************************************* | |
69 | +** | |
70 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
71 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
72 | +** | |
73 | +** This copyrighted material is made available to anyone wishing to use, | |
74 | +** modify, copy, or redistribute it subject to the terms and conditions | |
75 | +** of the GNU General Public License v.2. | |
76 | +** | |
77 | +******************************************************************************* | |
78 | +******************************************************************************/ | |
79 | + | |
80 | +/* | |
81 | + * This delivers ASTs and checks for dead remote requests and deadlocks. | |
82 | + */ | |
83 | + | |
84 | +#include <linux/timer.h> | |
85 | + | |
86 | +#include "dlm_internal.h" | |
87 | +#include "rsb.h" | |
88 | +#include "lockqueue.h" | |
89 | +#include "dir.h" | |
90 | +#include "locking.h" | |
91 | +#include "lkb.h" | |
92 | +#include "lowcomms.h" | |
93 | +#include "midcomms.h" | |
94 | +#include "ast.h" | |
95 | +#include "nodes.h" | |
96 | +#include "config.h" | |
97 | + | |
98 | +/* Wake up flags for astd */ | |
99 | +#define GDLMD_WAKE_ASTS 1 | |
100 | +#define GDLMD_WAKE_TIMER 2 | |
101 | + | |
102 | +static struct list_head _deadlockqueue; | |
103 | +static struct semaphore _deadlockqueue_lock; | |
104 | +static struct list_head _lockqueue; | |
105 | +static struct semaphore _lockqueue_lock; | |
106 | +static struct timer_list _lockqueue_timer; | |
107 | +static struct list_head _ast_queue; | |
108 | +static struct semaphore _ast_queue_lock; | |
109 | +static wait_queue_head_t _astd_waitchan; | |
110 | +static atomic_t _astd_running; | |
111 | +static long _astd_pid; | |
112 | +static unsigned long _astd_wakeflags; | |
113 | +static struct completion _astd_done; | |
114 | + | |
115 | +void add_to_lockqueue(gd_lkb_t *lkb) | |
116 | +{ | |
117 | + /* Time stamp the entry so we know if it's been waiting too long */ | |
118 | + lkb->lkb_lockqueue_time = jiffies; | |
119 | + | |
120 | + down(&_lockqueue_lock); | |
121 | + list_add(&lkb->lkb_lockqueue, &_lockqueue); | |
122 | + up(&_lockqueue_lock); | |
123 | +} | |
124 | + | |
125 | +void remove_from_lockqueue(gd_lkb_t *lkb) | |
126 | +{ | |
127 | + down(&_lockqueue_lock); | |
128 | + list_del(&lkb->lkb_lockqueue); | |
129 | + up(&_lockqueue_lock); | |
130 | +} | |
131 | + | |
132 | +void add_to_deadlockqueue(gd_lkb_t *lkb) | |
133 | +{ | |
134 | + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags)) | |
135 | + return; | |
136 | + lkb->lkb_duetime = jiffies; | |
137 | + down(&_deadlockqueue_lock); | |
138 | + list_add(&lkb->lkb_deadlockq, &_deadlockqueue); | |
139 | + up(&_deadlockqueue_lock); | |
140 | +} | |
141 | + | |
142 | +void remove_from_deadlockqueue(gd_lkb_t *lkb) | |
143 | +{ | |
144 | + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags)) | |
145 | + return; | |
146 | + | |
147 | + down(&_deadlockqueue_lock); | |
148 | + list_del(&lkb->lkb_deadlockq); | |
149 | + up(&_deadlockqueue_lock); | |
150 | + | |
151 | + /* Invalidate the due time */ | |
152 | + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime)); | |
153 | +} | |
154 | + | |
4bf12011 | 155 | +/* |
5cdbd17b | 156 | + * deliver an AST to a user |
4bf12011 | 157 | + */ |
158 | + | |
5cdbd17b | 159 | +static void deliver_ast(gd_lkb_t *lkb, uint16_t ast_type) |
4bf12011 | 160 | +{ |
161 | + void (*cast) (long param) = lkb->lkb_astaddr; | |
162 | + void (*bast) (long param, int mode) = lkb->lkb_bastaddr; | |
163 | + | |
5cdbd17b AM |
164 | + if (ast_type == AST_BAST) { |
165 | + if (!bast) | |
166 | + return; | |
167 | + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) | |
168 | + return; | |
4bf12011 | 169 | + bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode); |
5cdbd17b AM |
170 | + } else { |
171 | + if (!cast) | |
172 | + return; | |
173 | + cast(lkb->lkb_astparam); | |
4bf12011 | 174 | + } |
4bf12011 | 175 | +} |
176 | + | |
177 | +/* | |
178 | + * Queue an AST for delivery, this will only deal with | |
179 | + * kernel ASTs, usermode API will piggyback on top of this. | |
180 | + * | |
181 | + * This can be called in either the user or DLM context. | |
182 | + * ASTs are queued EVEN IF we are already running in gdlm_astd | |
183 | + context as we don't know what other locks are held (eg we could | |
184 | + be being called from a lock operation that was called from | |
185 | + another AST!) | |
186 | + * If the AST is to be queued remotely then a message is sent to | |
187 | + * the target system via midcomms. | |
188 | + */ | |
189 | + | |
5cdbd17b | 190 | +void queue_ast(gd_lkb_t *lkb, uint16_t flags, uint8_t rqmode) |
4bf12011 | 191 | +{ |
192 | + struct gd_remlockrequest req; | |
193 | + | |
194 | + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { | |
195 | + /* | |
196 | + * Send a message to have an ast queued remotely. Note: we do | |
197 | + * not send remote completion asts, they are handled as part of | |
198 | + * remote lock granting. | |
199 | + */ | |
5cdbd17b | 200 | + if (flags & AST_BAST) { |
4bf12011 | 201 | + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST; |
202 | + req.rr_header.rh_length = sizeof(req); | |
203 | + req.rr_header.rh_flags = 0; | |
204 | + req.rr_header.rh_lkid = lkb->lkb_id; | |
205 | + req.rr_header.rh_lockspace = | |
206 | + lkb->lkb_resource->res_ls->ls_global_id; | |
207 | + req.rr_status = lkb->lkb_retstatus; | |
208 | + req.rr_remlkid = lkb->lkb_remid; | |
209 | + req.rr_rqmode = rqmode; | |
210 | + | |
211 | + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header, | |
5cdbd17b | 212 | + lkb->lkb_resource->res_ls->ls_allocation); |
4bf12011 | 213 | + } else if (lkb->lkb_retstatus == -EDEADLOCK) { |
214 | + /* | |
215 | + * We only queue remote Completion ASTs here for error | |
216 | + * completions that happen out of band. | |
217 | + * DEADLOCK is one such. | |
218 | + */ | |
4bf12011 | 219 | + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST; |
220 | + req.rr_header.rh_length = sizeof(req); | |
221 | + req.rr_header.rh_flags = 0; | |
222 | + req.rr_header.rh_lkid = lkb->lkb_id; | |
223 | + req.rr_header.rh_lockspace = | |
224 | + lkb->lkb_resource->res_ls->ls_global_id; | |
225 | + req.rr_status = lkb->lkb_retstatus; | |
226 | + req.rr_remlkid = lkb->lkb_remid; | |
227 | + req.rr_rqmode = rqmode; | |
228 | + | |
229 | + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header, | |
5cdbd17b | 230 | + lkb->lkb_resource->res_ls->ls_allocation); |
4bf12011 | 231 | + } |
232 | + } else { | |
233 | + /* | |
5cdbd17b | 234 | + * Prepare info that will be returned in ast/bast. |
4bf12011 | 235 | + */ |
236 | + | |
5cdbd17b | 237 | + if (flags & AST_BAST) { |
4bf12011 | 238 | + lkb->lkb_bastmode = rqmode; |
239 | + } else { | |
240 | + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus; | |
241 | + | |
242 | + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) | |
243 | + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED; | |
244 | + else | |
245 | + lkb->lkb_lksb->sb_flags = 0; | |
246 | + } | |
247 | + | |
4bf12011 | 248 | + down(&_ast_queue_lock); |
5cdbd17b AM |
249 | + if (lkb->lkb_astflags & AST_DEL) |
250 | + log_print("queue_ast on deleted lkb %x ast %x pid %u", | |
251 | + lkb->lkb_id, lkb->lkb_astflags, current->pid); | |
252 | + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST))) | |
4bf12011 | 253 | + list_add_tail(&lkb->lkb_astqueue, &_ast_queue); |
5cdbd17b | 254 | + lkb->lkb_astflags |= flags; |
4bf12011 | 255 | + up(&_ast_queue_lock); |
256 | + | |
257 | + /* It is the responsibility of the caller to call wake_astd() | |
258 | + * after it has finished other locking operations that request | |
259 | + * the ASTs to be delivered after */ | |
260 | + } | |
261 | +} | |
262 | + | |
263 | +/* | |
5cdbd17b | 264 | + * Process any LKBs on the AST queue. |
4bf12011 | 265 | + */ |
266 | + | |
267 | +static void process_asts(void) | |
268 | +{ | |
5cdbd17b AM |
269 | + gd_lkb_t *lkb; |
270 | + uint16_t flags; | |
4bf12011 | 271 | + |
5cdbd17b AM |
272 | + for (;;) { |
273 | + down(&_ast_queue_lock); | |
274 | + if (list_empty(&_ast_queue)) { | |
275 | + up(&_ast_queue_lock); | |
276 | + break; | |
277 | + } | |
278 | + | |
279 | + lkb = list_entry(_ast_queue.next, gd_lkb_t, lkb_astqueue); | |
280 | + list_del(&lkb->lkb_astqueue); | |
281 | + flags = lkb->lkb_astflags; | |
282 | + lkb->lkb_astflags = 0; | |
283 | + up(&_ast_queue_lock); | |
4bf12011 | 284 | + |
5cdbd17b AM |
285 | + if (flags & AST_COMP) |
286 | + deliver_ast(lkb, AST_COMP); | |
4bf12011 | 287 | + |
5cdbd17b AM |
288 | + if (flags & AST_BAST) { |
289 | + if (flags & AST_DEL) | |
290 | + log_print("skip bast on %x", lkb->lkb_id); | |
291 | + else | |
292 | + deliver_ast(lkb, AST_BAST); | |
293 | + } | |
4bf12011 | 294 | + |
5cdbd17b AM |
295 | + if (flags & AST_DEL) { |
296 | + gd_res_t *rsb = lkb->lkb_resource; | |
297 | + gd_ls_t *ls = rsb->res_ls; | |
4bf12011 | 298 | + |
5cdbd17b AM |
299 | + GDLM_ASSERT(lkb->lkb_astflags == 0, |
300 | + printk("%x %x\n", lkb->lkb_id, lkb->lkb_astflags);); | |
4bf12011 | 301 | + |
5cdbd17b AM |
302 | + down_read(&ls->ls_in_recovery); |
303 | + release_lkb(ls, lkb); | |
304 | + release_rsb(rsb); | |
305 | + up_read(&ls->ls_in_recovery); | |
306 | + } | |
307 | + | |
308 | + schedule(); | |
4bf12011 | 309 | + } |
4bf12011 | 310 | +} |
311 | + | |
312 | +void lockqueue_lkb_mark(gd_ls_t *ls) | |
313 | +{ | |
314 | + gd_lkb_t *lkb, *safe; | |
315 | + int count = 0; | |
316 | + | |
317 | + log_all(ls, "mark waiting requests"); | |
318 | + | |
319 | + down(&_lockqueue_lock); | |
320 | + | |
321 | + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { | |
322 | + | |
323 | + if (lkb->lkb_resource->res_ls != ls) | |
324 | + continue; | |
325 | + | |
326 | + /* | |
327 | + * These lkb's are new and the master is being looked up. Mark | |
328 | + * the lkb request to be resent. Even if the destination node | |
329 | + * for the request is still living and has our request, it will | |
330 | + * purge all resdir requests in purge_requestqueue. If there's | |
331 | + * a reply to the LOOKUP request in our requestqueue (the reply | |
332 | + * arrived after ls_stop), it is invalid and will be discarded | |
333 | + * in purge_requestqueue, too. | |
334 | + */ | |
335 | + | |
336 | + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) { | |
337 | + GDLM_ASSERT(lkb->lkb_nodeid == -1, | |
338 | + log_error(ls, "nodeid=%d\n", | |
339 | + lkb->lkb_nodeid);); | |
340 | + | |
341 | + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND; | |
342 | + count++; | |
343 | + continue; | |
344 | + } | |
345 | + | |
346 | + /* | |
347 | + * These lkb's have an outstanding request to a bygone node. | |
348 | + * The request will be redirected to the new master node in | |
349 | + * resend_cluster_requests(). Don't mark the request for | |
350 | + * resending if there's a reply for it saved in the | |
351 | + * requestqueue. | |
352 | + */ | |
353 | + | |
354 | + if (in_nodes_gone(ls, lkb->lkb_nodeid) && | |
355 | + !reply_in_requestqueue(ls, lkb->lkb_id)) { | |
356 | + | |
357 | + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND; | |
358 | + | |
359 | + /* | |
360 | + * Don't rebuild this lkb on a new rsb in | |
361 | + * rebuild_rsbs_send(). | |
362 | + */ | |
363 | + | |
364 | + if (lkb->lkb_lockqueue_state == | |
365 | + GDLM_LQSTATE_WAIT_CONDGRANT) { | |
366 | + GDLM_ASSERT(lkb->lkb_status == | |
367 | + GDLM_LKSTS_WAITING, ); | |
368 | + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; | |
369 | + } | |
370 | + | |
371 | + /* | |
372 | + * This flag indicates to the new master that his lkb | |
373 | + * is in the midst of a convert request and should be | |
374 | + * placed on the granted queue rather than the convert | |
375 | + * queue. We will resend this convert request to the | |
376 | + * new master. | |
377 | + */ | |
378 | + | |
379 | + else if (lkb->lkb_lockqueue_state == | |
380 | + GDLM_LQSTATE_WAIT_CONVERT) { | |
381 | + GDLM_ASSERT(lkb->lkb_status == | |
382 | + GDLM_LKSTS_CONVERT, ); | |
383 | + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT; | |
384 | + } | |
385 | + | |
386 | + count++; | |
387 | + } | |
388 | + } | |
389 | + up(&_lockqueue_lock); | |
390 | + | |
391 | + log_all(ls, "marked %d requests", count); | |
392 | +} | |
393 | + | |
394 | +int resend_cluster_requests(gd_ls_t *ls) | |
395 | +{ | |
396 | + gd_lkb_t *lkb, *safe; | |
397 | + int error = 0, state, count = 0; | |
398 | + | |
399 | + log_all(ls, "resend marked requests"); | |
400 | + | |
401 | + down(&_lockqueue_lock); | |
402 | + | |
403 | + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { | |
404 | + | |
405 | + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { | |
406 | + log_debug(ls, "resend_cluster_requests: aborted"); | |
407 | + error = -EINTR; | |
408 | + break; | |
409 | + } | |
410 | + | |
411 | + if (lkb->lkb_resource->res_ls != ls) | |
412 | + continue; | |
413 | + | |
414 | + log_debug(ls, "resend_cluster_requests id=%x nodeid=%d " | |
415 | + "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid, | |
416 | + lkb->lkb_lockqueue_state, lkb->lkb_flags); | |
417 | + | |
418 | + /* | |
419 | + * Resend/process the lockqueue lkb's (in-progress requests) | |
420 | + * that were flagged at the start of recovery in | |
421 | + * lockqueue_lkb_mark(). | |
422 | + */ | |
423 | + | |
424 | + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) { | |
425 | + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND; | |
426 | + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD; | |
427 | + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT; | |
428 | + | |
429 | + if (lkb->lkb_nodeid == -1) { | |
430 | + /* | |
431 | + * Send lookup to new resdir node. | |
432 | + */ | |
433 | + lkb->lkb_lockqueue_time = jiffies; | |
434 | + send_cluster_request(lkb, | |
435 | + lkb->lkb_lockqueue_state); | |
436 | + } | |
437 | + | |
438 | + else if (lkb->lkb_nodeid != 0) { | |
439 | + /* | |
440 | + * There's a new RSB master (that's not us.) | |
441 | + */ | |
442 | + lkb->lkb_lockqueue_time = jiffies; | |
443 | + send_cluster_request(lkb, | |
444 | + lkb->lkb_lockqueue_state); | |
445 | + } | |
446 | + | |
447 | + else { | |
448 | + /* | |
449 | + * We are the new RSB master for this lkb | |
450 | + * request. | |
451 | + */ | |
452 | + state = lkb->lkb_lockqueue_state; | |
453 | + lkb->lkb_lockqueue_state = 0; | |
454 | + /* list_del equals remove_from_lockqueue() */ | |
455 | + list_del(&lkb->lkb_lockqueue); | |
456 | + process_remastered_lkb(lkb, state); | |
457 | + } | |
458 | + | |
459 | + count++; | |
460 | + } | |
461 | + } | |
462 | + up(&_lockqueue_lock); | |
463 | + | |
464 | + log_all(ls, "resent %d requests", count); | |
465 | + return error; | |
466 | +} | |
467 | + | |
468 | +/* | |
469 | + * Process any LKBs on the Lock queue, this | |
470 | + * just looks at the entries to see if they have been | |
471 | + * on the queue too long and fails the requests if so. | |
472 | + */ | |
473 | + | |
474 | +static void process_lockqueue(void) | |
475 | +{ | |
476 | + gd_lkb_t *lkb, *safe; | |
477 | + gd_ls_t *ls; | |
478 | + int count = 0; | |
479 | + | |
480 | + down(&_lockqueue_lock); | |
481 | + | |
482 | + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { | |
483 | + ls = lkb->lkb_resource->res_ls; | |
484 | + | |
485 | + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags)) | |
486 | + continue; | |
487 | + | |
488 | + /* Don't time out locks that are in transition */ | |
489 | + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) | |
490 | + continue; | |
491 | + | |
492 | + if (check_timeout(lkb->lkb_lockqueue_time, | |
493 | + dlm_config.lock_timeout)) { | |
494 | + count++; | |
495 | + list_del(&lkb->lkb_lockqueue); | |
496 | + up(&_lockqueue_lock); | |
497 | + cancel_lockop(lkb, -ETIMEDOUT); | |
498 | + down(&_lockqueue_lock); | |
499 | + } | |
500 | + } | |
501 | + up(&_lockqueue_lock); | |
502 | + | |
503 | + if (count) | |
504 | + wake_astd(); | |
505 | + | |
506 | + if (atomic_read(&_astd_running)) | |
507 | + mod_timer(&_lockqueue_timer, | |
508 | + jiffies + ((dlm_config.lock_timeout >> 1) * HZ)); | |
509 | +} | |
510 | + | |
511 | +/* Look for deadlocks */ | |
512 | +static void process_deadlockqueue(void) | |
513 | +{ | |
514 | + gd_lkb_t *lkb, *safe; | |
515 | + | |
516 | + down(&_deadlockqueue_lock); | |
517 | + | |
518 | + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) { | |
519 | + gd_lkb_t *kill_lkb; | |
520 | + | |
521 | + /* Only look at "due" locks */ | |
522 | + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime)) | |
523 | + break; | |
524 | + | |
525 | + /* Don't look at locks that are in transition */ | |
526 | + if (!test_bit(LSFL_LS_RUN, | |
527 | + &lkb->lkb_resource->res_ls->ls_flags)) | |
528 | + continue; | |
529 | + | |
530 | + up(&_deadlockqueue_lock); | |
531 | + | |
532 | + /* Lock has hit due time, check for conversion deadlock */ | |
533 | + kill_lkb = conversion_deadlock_check(lkb); | |
534 | + if (kill_lkb) | |
535 | + cancel_conversion(kill_lkb, -EDEADLOCK); | |
536 | + | |
537 | + down(&_deadlockqueue_lock); | |
538 | + } | |
539 | + up(&_deadlockqueue_lock); | |
540 | +} | |
541 | + | |
542 | +static __inline__ int no_asts(void) | |
543 | +{ | |
544 | + int ret; | |
545 | + | |
546 | + down(&_ast_queue_lock); | |
547 | + ret = list_empty(&_ast_queue); | |
548 | + up(&_ast_queue_lock); | |
549 | + return ret; | |
550 | +} | |
551 | + | |
552 | +static void lockqueue_timer_fn(unsigned long arg) | |
553 | +{ | |
554 | + set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags); | |
555 | + wake_up(&_astd_waitchan); | |
556 | +} | |
557 | + | |
558 | +/* | |
559 | + * DLM daemon which delivers asts. | |
560 | + */ | |
561 | + | |
562 | +static int dlm_astd(void *data) | |
563 | +{ | |
564 | + daemonize("dlm_astd"); | |
565 | + | |
566 | + INIT_LIST_HEAD(&_lockqueue); | |
567 | + init_MUTEX(&_lockqueue_lock); | |
568 | + INIT_LIST_HEAD(&_deadlockqueue); | |
569 | + init_MUTEX(&_deadlockqueue_lock); | |
570 | + INIT_LIST_HEAD(&_ast_queue); | |
571 | + init_MUTEX(&_ast_queue_lock); | |
572 | + init_waitqueue_head(&_astd_waitchan); | |
573 | + complete(&_astd_done); | |
574 | + | |
575 | + /* | |
576 | + * Set a timer to check the lockqueue for dead locks (and deadlocks). | |
577 | + */ | |
578 | + | |
579 | + init_timer(&_lockqueue_timer); | |
580 | + _lockqueue_timer.function = lockqueue_timer_fn; | |
581 | + _lockqueue_timer.data = 0; | |
582 | + mod_timer(&_lockqueue_timer, | |
583 | + jiffies + ((dlm_config.lock_timeout >> 1) * HZ)); | |
584 | + | |
585 | + while (atomic_read(&_astd_running)) { | |
586 | + wchan_cond_sleep_intr(_astd_waitchan, no_asts()); | |
587 | + | |
588 | + if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags)) | |
589 | + process_asts(); | |
590 | + | |
591 | + if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) { | |
592 | + process_lockqueue(); | |
593 | + if (dlm_config.deadlocktime) | |
594 | + process_deadlockqueue(); | |
595 | + } | |
596 | + } | |
597 | + | |
598 | + if (timer_pending(&_lockqueue_timer)) | |
599 | + del_timer(&_lockqueue_timer); | |
600 | + | |
601 | + complete(&_astd_done); | |
602 | + | |
603 | + return 0; | |
604 | +} | |
605 | + | |
606 | +void wake_astd(void) | |
607 | +{ | |
608 | + set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags); | |
609 | + wake_up(&_astd_waitchan); | |
610 | +} | |
611 | + | |
612 | +int astd_start() | |
613 | +{ | |
614 | + init_completion(&_astd_done); | |
615 | + atomic_set(&_astd_running, 1); | |
616 | + _astd_pid = kernel_thread(dlm_astd, NULL, 0); | |
617 | + wait_for_completion(&_astd_done); | |
618 | + return 0; | |
619 | +} | |
620 | + | |
621 | +void astd_stop() | |
622 | +{ | |
623 | + atomic_set(&_astd_running, 0); | |
624 | + wake_astd(); | |
625 | + wait_for_completion(&_astd_done); | |
626 | +} | |
627 | diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h | |
628 | --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
629 | +++ linux-patched/cluster/dlm/ast.h 2004-06-29 20:01:19.000000000 +0800 |
630 | @@ -0,0 +1,28 @@ | |
4bf12011 | 631 | +/****************************************************************************** |
632 | +******************************************************************************* | |
633 | +** | |
634 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
635 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
636 | +** | |
637 | +** This copyrighted material is made available to anyone wishing to use, | |
638 | +** modify, copy, or redistribute it subject to the terms and conditions | |
639 | +** of the GNU General Public License v.2. | |
640 | +** | |
641 | +******************************************************************************* | |
642 | +******************************************************************************/ | |
643 | + | |
644 | +#ifndef __AST_DOT_H__ | |
645 | +#define __AST_DOT_H__ | |
646 | + | |
5cdbd17b AM |
647 | +void lockqueue_lkb_mark(gd_ls_t *ls); |
648 | +int resend_cluster_requests(gd_ls_t *ls); | |
649 | +void add_to_lockqueue(gd_lkb_t *lkb); | |
650 | +void remove_from_lockqueue(gd_lkb_t *lkb); | |
651 | +void add_to_deadlockqueue(gd_lkb_t *lkb); | |
652 | +void remove_from_deadlockqueue(gd_lkb_t *lkb); | |
653 | +void queue_ast(gd_lkb_t *lkb, uint16_t astflags, uint8_t rqmode); | |
4bf12011 | 654 | +void wake_astd(void); |
655 | +int astd_start(void); | |
656 | +void astd_stop(void); | |
657 | + | |
658 | +#endif /* __AST_DOT_H__ */ | |
659 | diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c | |
660 | --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 661 | +++ linux-patched/cluster/dlm/config.c 2004-06-29 20:01:19.000000000 +0800 |
4bf12011 | 662 | @@ -0,0 +1,125 @@ |
663 | +/****************************************************************************** | |
664 | +******************************************************************************* | |
665 | +** | |
666 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
667 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
668 | +** | |
669 | +** This copyrighted material is made available to anyone wishing to use, | |
670 | +** modify, copy, or redistribute it subject to the terms and conditions | |
671 | +** of the GNU General Public License v.2. | |
672 | +** | |
673 | +******************************************************************************* | |
674 | +******************************************************************************/ | |
675 | + | |
676 | +#include <linux/module.h> | |
677 | +#include <linux/proc_fs.h> | |
678 | + | |
679 | +#include "dlm_internal.h" | |
680 | +#include "lowcomms.h" | |
681 | +#include "config.h" | |
682 | + | |
683 | +/* Config file defaults */ | |
684 | +#define DEFAULT_TCP_PORT 21064 | |
685 | +#define DEFAULT_LOCK_TIMEOUT 30 | |
686 | +#define DEFAULT_BUFFER_SIZE 4096 | |
687 | +#define DEFAULT_RESHASHTBL 256 | |
688 | +#define DEFAULT_LOCKIDTBL 1024 | |
689 | +#define DEFAULT_MAX_CONNECTIONS 128 | |
690 | +#define DEFAULT_DEADLOCKTIME 10 | |
691 | + | |
692 | +struct config_info dlm_config = { | |
693 | + .tcp_port = DEFAULT_TCP_PORT, | |
694 | + .lock_timeout = DEFAULT_LOCK_TIMEOUT, | |
695 | + .buffer_size = DEFAULT_BUFFER_SIZE, | |
696 | + .reshashtbl = DEFAULT_RESHASHTBL, | |
697 | + .lockidtbl = DEFAULT_LOCKIDTBL, | |
698 | + .max_connections = DEFAULT_MAX_CONNECTIONS, | |
699 | + .deadlocktime = DEFAULT_DEADLOCKTIME, | |
700 | +}; | |
701 | + | |
702 | + | |
703 | +static struct config_proc_info { | |
704 | + char *name; | |
705 | + int *value; | |
706 | +} config_proc[] = { | |
707 | + { | |
708 | + .name = "tcp_port", | |
709 | + .value = &dlm_config.tcp_port, | |
710 | + }, | |
711 | + { | |
712 | + .name = "lock_timeout", | |
713 | + .value = &dlm_config.lock_timeout, | |
714 | + }, | |
715 | + { | |
716 | + .name = "buffer_size", | |
717 | + .value = &dlm_config.buffer_size, | |
718 | + }, | |
719 | + { | |
720 | + .name = "reshashtbl", | |
721 | + .value = &dlm_config.reshashtbl, | |
722 | + }, | |
723 | + { | |
724 | + .name = "lockidtbl", | |
725 | + .value = &dlm_config.lockidtbl, | |
726 | + }, | |
727 | + { | |
728 | + .name = "max_connections", | |
729 | + .value = &dlm_config.max_connections, | |
730 | + }, | |
731 | + { | |
732 | + .name = "deadlocktime", | |
733 | + .value = &dlm_config.deadlocktime, | |
734 | + }, | |
735 | +}; | |
736 | +static struct proc_dir_entry *dlm_dir; | |
737 | + | |
738 | +static int dlm_config_read_proc(char *page, char **start, off_t off, int count, | |
739 | + int *eof, void *data) | |
740 | +{ | |
741 | + struct config_proc_info *cinfo = data; | |
742 | + return snprintf(page, count, "%d\n", *cinfo->value); | |
743 | +} | |
744 | + | |
745 | +static int dlm_config_write_proc(struct file *file, const char *buffer, | |
746 | + unsigned long count, void *data) | |
747 | +{ | |
748 | + struct config_proc_info *cinfo = data; | |
749 | + int value; | |
750 | + char *end; | |
751 | + | |
752 | + value = simple_strtoul(buffer, &end, 10); | |
753 | + if (*end) | |
754 | + *cinfo->value = value; | |
755 | + return count; | |
756 | +} | |
757 | + | |
758 | +int dlm_config_init(void) | |
759 | +{ | |
760 | + int i; | |
761 | + struct proc_dir_entry *pde; | |
762 | + | |
763 | + dlm_dir = proc_mkdir("cluster/config/dlm", 0); | |
764 | + if (!dlm_dir) | |
765 | + return -1; | |
766 | + | |
767 | + dlm_dir->owner = THIS_MODULE; | |
768 | + | |
769 | + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) { | |
770 | + pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir); | |
771 | + if (pde) { | |
772 | + pde->data = &config_proc[i]; | |
773 | + pde->write_proc = dlm_config_write_proc; | |
774 | + pde->read_proc = dlm_config_read_proc; | |
775 | + } | |
776 | + } | |
777 | + return 0; | |
778 | +} | |
779 | + | |
780 | +void dlm_config_exit(void) | |
781 | +{ | |
782 | + int i; | |
783 | + | |
784 | + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) | |
785 | + remove_proc_entry(config_proc[i].name, dlm_dir); | |
786 | + remove_proc_entry("cluster/config/dlm", NULL); | |
787 | +} | |
788 | diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h | |
789 | --- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 790 | +++ linux-patched/cluster/dlm/config.h 2004-06-29 20:01:19.000000000 +0800 |
4bf12011 | 791 | @@ -0,0 +1,31 @@ |
792 | +/****************************************************************************** | |
793 | +******************************************************************************* | |
794 | +** | |
795 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
796 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
797 | +** | |
798 | +** This copyrighted material is made available to anyone wishing to use, | |
799 | +** modify, copy, or redistribute it subject to the terms and conditions | |
800 | +** of the GNU General Public License v.2. | |
801 | +** | |
802 | +******************************************************************************* | |
803 | +******************************************************************************/ | |
804 | + | |
805 | +#ifndef __CONFIG_DOT_H__ | |
806 | +#define __CONFIG_DOT_H__ | |
807 | + | |
808 | +struct config_info { | |
809 | + int tcp_port; | |
810 | + int lock_timeout; | |
811 | + int buffer_size; | |
812 | + int reshashtbl; | |
813 | + int lockidtbl; | |
814 | + int max_connections; | |
815 | + int deadlocktime; | |
816 | +}; | |
817 | + | |
818 | +extern struct config_info dlm_config; | |
819 | +extern int dlm_config_init(void); | |
820 | +extern void dlm_config_exit(void); | |
821 | + | |
822 | +#endif /* __CONFIG_DOT_H__ */ | |
823 | diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c | |
824 | --- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 825 | +++ linux-patched/cluster/dlm/device.c 2004-06-29 20:01:19.000000000 +0800 |
4bf12011 | 826 | @@ -0,0 +1,1020 @@ |
827 | +/****************************************************************************** | |
828 | +******************************************************************************* | |
829 | +** | |
830 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
831 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
832 | +** | |
833 | +** This copyrighted material is made available to anyone wishing to use, | |
834 | +** modify, copy, or redistribute it subject to the terms and conditions | |
835 | +** of the GNU General Public License v.2. | |
836 | +** | |
837 | +******************************************************************************* | |
838 | +******************************************************************************/ | |
839 | + | |
840 | +/* | |
841 | + * device.c | |
842 | + * | |
843 | + * This is the userland interface to the DLM. | |
844 | + * | |
845 | + * The locking is done via a misc char device (find the | |
846 | + * registered minor number in /proc/misc). | |
847 | + * | |
848 | + * User code should not use this interface directly but | |
849 | + * call the library routines in libdlm.a instead. | |
850 | + * | |
851 | + */ | |
852 | + | |
853 | +#include <linux/miscdevice.h> | |
854 | +#include <linux/init.h> | |
855 | +#include <linux/wait.h> | |
856 | +#include <linux/module.h> | |
857 | +#include <linux/file.h> | |
858 | +#include <linux/fs.h> | |
859 | +#include <linux/poll.h> | |
860 | +#include <linux/signal.h> | |
861 | +#include <linux/spinlock.h> | |
862 | +#include <asm/ioctls.h> | |
863 | + | |
864 | +#include "dlm_internal.h" | |
865 | +#include "device.h" | |
866 | + | |
867 | +extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int); | |
868 | +static struct file_operations _dlm_fops; | |
869 | +static const char *name_prefix="dlm"; | |
870 | +static struct list_head user_ls_list; | |
871 | + | |
872 | +/* Flags in li_flags */ | |
873 | +#define LI_FLAG_COMPLETE 1 | |
874 | +#define LI_FLAG_FIRSTLOCK 2 | |
875 | + | |
876 | +struct lock_info { | |
877 | + uint8_t li_cmd; | |
878 | + struct dlm_lksb li_lksb; | |
879 | + wait_queue_head_t li_waitq; | |
880 | + unsigned long li_flags; | |
881 | + void __user *li_astparam; | |
882 | + void __user *li_astaddr; | |
883 | + void __user *li_bastaddr; | |
884 | + struct file_info *li_file; | |
885 | + struct dlm_lksb __user *li_user_lksb; | |
886 | + struct semaphore li_firstlock; | |
887 | + struct dlm_queryinfo *li_queryinfo; | |
888 | + struct dlm_queryinfo __user *li_user_queryinfo; | |
889 | +}; | |
890 | + | |
891 | +/* A queued AST no less */ | |
892 | +struct ast_info { | |
893 | + struct dlm_lock_result result; | |
894 | + struct dlm_queryinfo *queryinfo; | |
895 | + struct dlm_queryinfo __user *user_queryinfo; | |
896 | + struct list_head list; | |
897 | +}; | |
898 | + | |
899 | +/* One of these per userland lockspace */ | |
900 | +struct user_ls { | |
901 | + void *ls_lockspace; | |
902 | + atomic_t ls_refcnt; | |
903 | + long ls_flags; /* bit 1 means LS has been deleted */ | |
904 | + | |
905 | + /* Passed into misc_register() */ | |
906 | + struct miscdevice ls_miscinfo; | |
907 | + struct list_head ls_list; | |
908 | +}; | |
909 | + | |
910 | +/* misc_device info for the control device */ | |
911 | +static struct miscdevice ctl_device; | |
912 | + | |
913 | +/* | |
914 | + * Stuff we hang off the file struct. | |
915 | + * The first two are to cope with unlocking all the | |
917 | + * locks held by a process when it dies. | |
917 | + */ | |
918 | +struct file_info { | |
919 | + struct list_head fi_lkb_list; /* List of active lkbs */ | |
920 | + spinlock_t fi_lkb_lock; | |
921 | + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */ | |
922 | + spinlock_t fi_ast_lock; | |
923 | + wait_queue_head_t fi_wait; | |
924 | + struct user_ls *fi_ls; | |
925 | + atomic_t fi_refcnt; /* Number of users */ | |
926 | + unsigned long fi_flags; /* Bit 1 means the device is open */ | |
927 | +}; | |
928 | + | |
929 | + | |
930 | +/* get and put ops for file_info. | |
931 | + Actually I don't really like "get" and "put", but everyone | |
932 | + else seems to use them and I can't think of anything | |
933 | + nicer at the moment */ | |
934 | +static void get_file_info(struct file_info *f) | |
935 | +{ | |
936 | + atomic_inc(&f->fi_refcnt); | |
937 | +} | |
938 | + | |
939 | +static void put_file_info(struct file_info *f) | |
940 | +{ | |
941 | + if (atomic_dec_and_test(&f->fi_refcnt)) | |
942 | + kfree(f); | |
943 | +} | |
944 | + | |
945 | +/* Find a lockspace struct given the device minor number */ | |
946 | +static struct user_ls *find_lockspace(int minor) | |
947 | +{ | |
948 | + struct user_ls *lsinfo; | |
949 | + | |
950 | + list_for_each_entry(lsinfo, &user_ls_list, ls_list) { | |
951 | + | |
952 | + if (lsinfo->ls_miscinfo.minor == minor) | |
953 | + return lsinfo; | |
954 | + } | |
955 | + return NULL; | |
956 | +} | |
957 | + | |
958 | +static void add_lockspace_to_list(struct user_ls *lsinfo) | |
959 | +{ | |
960 | + list_add(&lsinfo->ls_list, &user_ls_list); | |
961 | +} | |
962 | + | |
963 | +/* Register a lockspace with the DLM and create a misc | |
964 | + device for userland to access it */ | |
965 | +static int register_lockspace(char *name, struct user_ls **ls) | |
966 | +{ | |
967 | + struct user_ls *newls; | |
968 | + int status; | |
969 | + int namelen; | |
970 | + | |
971 | + namelen = strlen(name)+strlen(name_prefix)+2; | |
972 | + | |
973 | + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL); | |
974 | + if (!newls) | |
975 | + return -ENOMEM; | |
976 | + memset(newls, 0, sizeof(struct user_ls)); | |
977 | + | |
978 | + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL); | |
979 | + if (!newls->ls_miscinfo.name) { | |
980 | + kfree(newls); | |
981 | + return -ENOMEM; | |
982 | + } | |
983 | + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name); | |
984 | + | |
985 | + status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1, | |
986 | + strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1, | |
987 | + &newls->ls_lockspace, 0); | |
988 | + | |
989 | + if (status != 0) { | |
990 | + kfree(newls->ls_miscinfo.name); | |
991 | + kfree(newls); | |
992 | + return status; | |
993 | + } | |
994 | + | |
995 | + newls->ls_miscinfo.fops = &_dlm_fops; | |
996 | + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR; | |
997 | + | |
998 | + status = misc_register(&newls->ls_miscinfo); | |
999 | + if (status) { | |
1000 | + log_print("failed to register misc device for %s", name); | |
1001 | + dlm_release_lockspace(newls->ls_lockspace, 0); | |
1002 | + kfree(newls->ls_miscinfo.name); | |
1003 | + kfree(newls); | |
1004 | + return status; | |
1005 | + } | |
1006 | + | |
1007 | + | |
1008 | + add_lockspace_to_list(newls); | |
1009 | + *ls = newls; | |
1010 | + return 0; | |
1011 | +} | |
1012 | + | |
1013 | +static int unregister_lockspace(struct user_ls *lsinfo, int force) | |
1014 | +{ | |
1015 | + int status; | |
1016 | + | |
1017 | + status = dlm_release_lockspace(lsinfo->ls_lockspace, force); | |
1018 | + if (status) | |
1019 | + return status; | |
1020 | + | |
1021 | + status = misc_deregister(&lsinfo->ls_miscinfo); | |
1022 | + if (status) | |
1023 | + return status; | |
1024 | + | |
1025 | + list_del(&lsinfo->ls_list); | |
1026 | + kfree(lsinfo->ls_miscinfo.name); | |
1027 | + kfree(lsinfo); | |
1028 | + | |
1029 | + return 0; | |
1030 | +} | |
1031 | + | |
1032 | +/* Add it to userland's AST queue */ | |
1033 | +static void add_to_astqueue(struct lock_info *li, void *astaddr) | |
1034 | +{ | |
1035 | + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL); | |
1036 | + if (!ast) | |
1037 | + return; | |
1038 | + | |
1039 | + ast->result.astparam = li->li_astparam; | |
1040 | + ast->result.astaddr = astaddr; | |
1041 | + ast->result.user_lksb = li->li_user_lksb; | |
1042 | + ast->result.cmd = li->li_cmd; | |
1043 | + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb)); | |
1044 | + | |
1045 | + /* These two will both be NULL for anything other than queries */ | |
1046 | + ast->queryinfo = li->li_queryinfo; | |
1047 | + ast->user_queryinfo = li->li_user_queryinfo; | |
1048 | + | |
1049 | + spin_lock(&li->li_file->fi_ast_lock); | |
1050 | + list_add_tail(&ast->list, &li->li_file->fi_ast_list); | |
1051 | + spin_unlock(&li->li_file->fi_ast_lock); | |
1052 | + wake_up_interruptible(&li->li_file->fi_wait); | |
1053 | +} | |
1054 | + | |
1055 | +static void bast_routine(void *param, int mode) | |
1056 | +{ | |
1057 | + struct lock_info *li = param; | |
1058 | + | |
1059 | + if (param) { | |
1060 | + add_to_astqueue(li, li->li_bastaddr); | |
1061 | + } | |
1062 | +} | |
1063 | + | |
1064 | +/* | |
1065 | + * This is the kernel's AST routine. | |
1066 | + * All lock, unlock & query operations complete here. | |
1067 | + * The only synchronous ops are those done during device close. | |
1068 | + */ | |
1069 | +static void ast_routine(void *param) | |
1070 | +{ | |
1071 | + struct lock_info *li = param; | |
1072 | + | |
1073 | + /* Param may be NULL if a persistent lock is unlocked by someone else */ | |
1074 | + if (!param) | |
1075 | + return; | |
1076 | + | |
1077 | + /* If it's an async request then post data to the user's AST queue. */ | |
1078 | + if (li->li_astaddr) { | |
1079 | + | |
1080 | + /* Only queue AST if the device is still open */ | |
1081 | + if (test_bit(1, &li->li_file->fi_flags)) | |
1082 | + add_to_astqueue(li, li->li_astaddr); | |
1083 | + | |
1084 | + /* If it's a new lock operation that failed, then | |
1085 | + * remove it from the owner queue and free the | |
1086 | + * lock_info. The DLM will not free the LKB until this | |
1087 | + * AST has completed. | |
1088 | + */ | |
1089 | + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && | |
1090 | + li->li_lksb.sb_status != 0) { | |
1091 | + gd_lkb_t *lkb; | |
1092 | + | |
1093 | + /* Wait till dlm_lock() has finished */ | |
1094 | + down(&li->li_firstlock); | |
1095 | + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid); | |
1096 | + if (lkb) { | |
1097 | + spin_lock(&li->li_file->fi_lkb_lock); | |
1098 | + list_del(&lkb->lkb_ownerqueue); | |
1099 | + spin_unlock(&li->li_file->fi_lkb_lock); | |
1100 | + } | |
1101 | + up(&li->li_firstlock); | |
1102 | + put_file_info(li->li_file); | |
1103 | + kfree(li); | |
1104 | + return; | |
1105 | + } | |
1106 | + /* Free unlocks & queries */ | |
1107 | + if (li->li_lksb.sb_status == -DLM_EUNLOCK || | |
1108 | + li->li_cmd == DLM_USER_QUERY) { | |
1109 | + put_file_info(li->li_file); | |
1110 | + kfree(li); | |
1111 | + } | |
1112 | + } | |
1113 | + else { | |
1114 | + /* Synchronous request, just wake up the caller | |
1115 | + set_bit(LI_FLAG_COMPLETE, &li->li_flags); | |
1116 | + wake_up_interruptible(&li->li_waitq); | |
1117 | + } | |
1118 | +} | |
1119 | + | |
1120 | +/* | |
1121 | + * Wait for the lock op to complete and return the status. | |
1122 | + */ | |
1123 | +static int wait_for_ast(struct lock_info *li) | |
1124 | +{ | |
1125 | + /* Wait for the AST routine to complete */ | |
1126 | + set_task_state(current, TASK_INTERRUPTIBLE); | |
1127 | + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags)) | |
1128 | + schedule(); | |
1129 | + | |
1130 | + set_task_state(current, TASK_RUNNING); | |
1131 | + | |
1132 | + return li->li_lksb.sb_status; | |
1133 | +} | |
1134 | + | |
1135 | + | |
1136 | +/* Open on control device */ | |
1137 | +static int dlm_ctl_open(struct inode *inode, struct file *file) | |
1138 | +{ | |
1139 | + return 0; | |
1140 | +} | |
1141 | + | |
1142 | +/* Close on control device */ | |
1143 | +static int dlm_ctl_close(struct inode *inode, struct file *file) | |
1144 | +{ | |
1145 | + return 0; | |
1146 | +} | |
1147 | + | |
1148 | +/* Open on lockspace device */ | |
1149 | +static int dlm_open(struct inode *inode, struct file *file) | |
1150 | +{ | |
1151 | + struct file_info *f; | |
1152 | + struct user_ls *lsinfo; | |
1153 | + | |
1154 | + lsinfo = find_lockspace(iminor(inode)); | |
1155 | + if (!lsinfo) | |
1156 | + return -ENOENT; | |
1157 | + | |
1158 | + f = kmalloc(sizeof(struct file_info), GFP_KERNEL); | |
1159 | + if (!f) | |
1160 | + return -ENOMEM; | |
1161 | + | |
1162 | + atomic_inc(&lsinfo->ls_refcnt); | |
1163 | + INIT_LIST_HEAD(&f->fi_lkb_list); | |
1164 | + INIT_LIST_HEAD(&f->fi_ast_list); | |
1165 | + spin_lock_init(&f->fi_ast_lock); | |
1166 | + spin_lock_init(&f->fi_lkb_lock); | |
1167 | + init_waitqueue_head(&f->fi_wait); | |
1168 | + f->fi_ls = lsinfo; | |
1169 | + atomic_set(&f->fi_refcnt, 1); | |
1170 | + set_bit(1, &f->fi_flags); | |
1171 | + | |
1172 | + file->private_data = f; | |
1173 | + | |
1174 | + return 0; | |
1175 | +} | |
1176 | + | |
1177 | +/* Check the user's version matches ours */ | |
1178 | +static int check_version(struct dlm_lock_params *params) | |
1179 | +{ | |
1180 | + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR || | |
1181 | + (params->version[0] == DLM_DEVICE_VERSION_MAJOR && | |
1182 | + params->version[1] > DLM_DEVICE_VERSION_MINOR)) { | |
1183 | + | |
1184 | + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)", | |
1185 | + params->version[0], | |
1186 | + params->version[1], | |
1187 | + params->version[2], | |
1188 | + DLM_DEVICE_VERSION_MAJOR, | |
1189 | + DLM_DEVICE_VERSION_MINOR, | |
1190 | + DLM_DEVICE_VERSION_PATCH); | |
1191 | + return -EINVAL; | |
1192 | + } | |
1193 | + return 0; | |
1194 | +} | |
1195 | + | |
1196 | +/* Close on lockspace device */ | |
1197 | +static int dlm_close(struct inode *inode, struct file *file) | |
1198 | +{ | |
1199 | + struct file_info *f = file->private_data; | |
1200 | + struct lock_info li; | |
1201 | + sigset_t tmpsig; | |
1202 | + sigset_t allsigs; | |
1203 | + gd_lkb_t *lkb, *safe; | |
1204 | + struct user_ls *lsinfo; | |
1205 | + DECLARE_WAITQUEUE(wq, current); | |
1206 | + | |
1207 | + lsinfo = find_lockspace(iminor(inode)); | |
1208 | + if (!lsinfo) | |
1209 | + return -ENOENT; | |
1210 | + | |
1211 | + /* Mark this closed so that ASTs will not be delivered any more */ | |
1212 | + clear_bit(1, &f->fi_flags); | |
1213 | + | |
1214 | + /* Block signals while we are doing this */ | |
1215 | + sigfillset(&allsigs); | |
1216 | + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); | |
1217 | + | |
1218 | + /* We use our own lock_info struct here, so that any | |
1219 | + * outstanding "real" ASTs will be delivered with the | |
1220 | + * corresponding "real" params, thus freeing the lock_info | |
1221 | + * that belongs to the lock. This catches the corner case where | |
1222 | + * a lock is BUSY when we try to unlock it here | |
1223 | + */ | |
1224 | + memset(&li, 0, sizeof(li)); | |
1225 | + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); | |
1226 | + init_waitqueue_head(&li.li_waitq); | |
1227 | + add_wait_queue(&li.li_waitq, &wq); | |
1228 | + | |
1229 | + /* | |
1230 | + * Free any outstanding locks, they are on the | |
1231 | + * list in LIFO order so there should be no problems | |
1232 | + * about unlocking parents before children. | |
1233 | + * Although we don't remove the lkbs from the list here | |
1234 | + * (what would be the point?), foreach_safe is needed | |
1235 | + * because the lkbs are freed during dlm_unlock operations | |
1236 | + */ | |
1237 | + list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) { | |
1238 | + int status; | |
1239 | + int lock_status; | |
1240 | + int flags = 0; | |
1241 | + struct lock_info *old_li; | |
1242 | + | |
1243 | + /* Make a copy of this pointer. If all goes well we will | |
1244 | + * free it later. if not it will be left to the AST routine | |
1245 | + * to tidy up | |
1246 | + */ | |
1247 | + old_li = (struct lock_info *)lkb->lkb_astparam; | |
1248 | + | |
1249 | + /* Don't unlock persistent locks */ | |
1250 | + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) { | |
1251 | + list_del(&lkb->lkb_ownerqueue); | |
1252 | + | |
1253 | + /* But tidy our references in it */ | |
1254 | + kfree(old_li); | |
1255 | + lkb->lkb_astparam = (long)NULL; | |
1256 | + put_file_info(f); | |
1257 | + continue; | |
1258 | + } | |
1259 | + | |
1260 | + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); | |
1261 | + | |
1262 | + /* If it's not granted then cancel the request. | |
1263 | + * If the lock was WAITING then it will be dropped, | |
1264 | + * if it was converting then it will be reverted to GRANTED, | |
1265 | + * then we will unlock it. | |
1266 | + */ | |
1267 | + lock_status = lkb->lkb_status; | |
1268 | + | |
1269 | + if (lock_status != GDLM_LKSTS_GRANTED) | |
1270 | + flags = DLM_LKF_CANCEL; | |
1271 | + | |
1272 | + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li); | |
1273 | + | |
1274 | + /* Must wait for it to complete as the next lock could be its | |
1275 | + * parent */ | |
1276 | + if (status == 0) | |
1277 | + wait_for_ast(&li); | |
1278 | + | |
1279 | + /* If it was waiting for a conversion, it will | |
1280 | + now be granted so we can unlock it properly */ | |
1281 | + if (lock_status == GDLM_LKSTS_CONVERT) { | |
1282 | + | |
1283 | + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); | |
1284 | + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li); | |
1285 | + | |
1286 | + if (status == 0) | |
1287 | + wait_for_ast(&li); | |
1288 | + } | |
1289 | + /* Unlock succeeded, free the lock_info struct. */ | |
1290 | + if (status == 0) { | |
1291 | + kfree(old_li); | |
1292 | + put_file_info(f); | |
1293 | + } | |
1294 | + } | |
1295 | + | |
1296 | + remove_wait_queue(&li.li_waitq, &wq); | |
1297 | + | |
1298 | + /* If this is the last reference, and the lockspace has been deleted | |
1299 | + then free the struct */ | |
1300 | + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) { | |
1301 | + kfree(lsinfo); | |
1302 | + } | |
1303 | + | |
1304 | + /* Restore signals */ | |
1305 | + sigprocmask(SIG_SETMASK, &tmpsig, NULL); | |
1306 | + recalc_sigpending(); | |
1307 | + | |
1308 | + return 0; | |
1309 | +} | |
1310 | + | |
1311 | +/* | |
1312 | + * ioctls to create/remove lockspaces, and check how many | |
1313 | + * outstanding ASTs there are against a particular LS. | |
1314 | + */ | |
1315 | +static int dlm_ioctl(struct inode *inode, struct file *file, | |
1316 | + uint command, ulong u) | |
1317 | +{ | |
1318 | + struct file_info *fi = file->private_data; | |
1319 | + int status = -EINVAL; | |
1320 | + int count; | |
1321 | + struct list_head *tmp_list; | |
1322 | + | |
1323 | + switch (command) { | |
1324 | + | |
1325 | + /* Are there any ASTs for us to read? | |
1326 | + * Warning, this returns the number of messages (ASTs) | |
1327 | + * in the queue, NOT the number of bytes to read | |
1328 | + */ | |
1329 | + case FIONREAD: | |
1330 | + count = 0; | |
1331 | + spin_lock(&fi->fi_ast_lock); | |
1332 | + list_for_each(tmp_list, &fi->fi_ast_list) | |
1333 | + count++; | |
1334 | + spin_unlock(&fi->fi_ast_lock); | |
1335 | + status = put_user(count, (int *)u); | |
1336 | + break; | |
1337 | + | |
1338 | + default: | |
1339 | + return -ENOTTY; | |
1340 | + } | |
1341 | + | |
1342 | + return status; | |
1343 | +} | |
1344 | + | |
1345 | +/* | |
1346 | + * ioctls to create/remove lockspaces. | |
1347 | + */ | |
1348 | +static int dlm_ctl_ioctl(struct inode *inode, struct file *file, | |
1349 | + uint command, ulong u) | |
1350 | +{ | |
1351 | + int status = -EINVAL; | |
1352 | + char ls_name[MAX_LS_NAME_LEN]; | |
1353 | + struct user_ls *lsinfo; | |
1354 | + int force = 0; | |
1355 | + | |
1356 | + switch (command) { | |
1357 | + case DLM_CREATE_LOCKSPACE: | |
1358 | + if (!capable(CAP_SYS_ADMIN)) | |
1359 | + return -EPERM; | |
1360 | + | |
1361 | + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0) | |
1362 | + return -EFAULT; | |
1363 | + status = register_lockspace(ls_name, &lsinfo); | |
1364 | + | |
1365 | + /* If it succeeded then return the minor number */ | |
1366 | + if (status == 0) | |
1367 | + status = lsinfo->ls_miscinfo.minor; | |
1368 | + break; | |
1369 | + | |
1370 | + case DLM_FORCE_RELEASE_LOCKSPACE: | |
1371 | + force = 2; | |
1372 | + | |
1373 | + case DLM_RELEASE_LOCKSPACE: | |
1374 | + if (!capable(CAP_SYS_ADMIN)) | |
1375 | + return -EPERM; | |
1376 | + | |
1377 | + lsinfo = find_lockspace(u); | |
1378 | + if (!lsinfo) | |
1379 | + return -EINVAL; | |
1380 | + status = unregister_lockspace(lsinfo, force); | |
1381 | + break; | |
1382 | + | |
1383 | + default: | |
1384 | + return -ENOTTY; | |
1385 | + } | |
1386 | + | |
1387 | + return status; | |
1388 | +} | |
1389 | + | |
1390 | +/* Deal with the messy stuff of copying a web of structs | |
1391 | + from kernel space to userspace */ | |
1392 | +static int copy_query_result(struct ast_info *ast) | |
1393 | +{ | |
1394 | + int status = -EFAULT; | |
1395 | + struct dlm_queryinfo qi; | |
1396 | + | |
1397 | + /* Get the pointers to userspace structs */ | |
1398 | + if (copy_from_user(&qi, ast->user_queryinfo, | |
1399 | + sizeof(struct dlm_queryinfo))) | |
1400 | + goto copy_out; | |
1401 | + | |
1402 | + /* TODO: does this deref a user pointer? */ | |
1403 | + if (put_user(ast->queryinfo->gqi_lockcount, | |
1404 | + &ast->user_queryinfo->gqi_lockcount)) | |
1405 | + goto copy_out; | |
1406 | + | |
1407 | + if (qi.gqi_resinfo) { | |
1408 | + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo, | |
1409 | + sizeof(struct dlm_resinfo))) | |
1410 | + goto copy_out; | |
1411 | + } | |
1412 | + | |
1413 | + if (qi.gqi_lockinfo) { | |
1414 | + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo, | |
1415 | + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount)) | |
1416 | + goto copy_out; | |
1417 | + } | |
1418 | + | |
1419 | + status = 0; | |
1420 | + | |
1421 | + if (ast->queryinfo->gqi_lockinfo) | |
1422 | + kfree(ast->queryinfo->gqi_lockinfo); | |
1423 | + | |
1424 | + if (ast->queryinfo->gqi_resinfo) | |
1425 | + kfree(ast->queryinfo->gqi_resinfo); | |
1426 | + | |
1427 | + kfree(ast->queryinfo); | |
1428 | + | |
1429 | + copy_out: | |
1430 | + return status; | |
1431 | +} | |
1432 | + | |
1433 | +/* Read call, might block if no ASTs are waiting. | |
1434 | + * It will only ever return one message at a time, regardless | |
1435 | + * of how many are pending. | |
1436 | + */ | |
1437 | +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) | |
1438 | +{ | |
1439 | + struct file_info *fi = file->private_data; | |
1440 | + struct ast_info *ast; | |
1441 | + int ret; | |
1442 | + DECLARE_WAITQUEUE(wait, current); | |
1443 | + | |
1444 | + if (count < sizeof(struct dlm_lock_result)) | |
1445 | + return -EINVAL; | |
1446 | + | |
1447 | + spin_lock(&fi->fi_ast_lock); | |
1448 | + if (list_empty(&fi->fi_ast_list)) { | |
1449 | + | |
1450 | + /* No waiting ASTs. | |
1451 | + * Return EOF if the lockspace has been deleted. | |
1452 | + */ | |
1453 | + if (test_bit(1, &fi->fi_ls->ls_flags)) | |
1454 | + return 0; | |
1455 | + | |
1456 | + if (file->f_flags & O_NONBLOCK) { | |
1457 | + spin_unlock(&fi->fi_ast_lock); | |
1458 | + return -EAGAIN; | |
1459 | + } | |
1460 | + | |
1461 | + add_wait_queue(&fi->fi_wait, &wait); | |
1462 | + | |
1463 | + repeat: | |
1464 | + set_current_state(TASK_INTERRUPTIBLE); | |
1465 | + if (list_empty(&fi->fi_ast_list) && | |
1466 | + !signal_pending(current)) { | |
1467 | + | |
1468 | + spin_unlock(&fi->fi_ast_lock); | |
1469 | + schedule(); | |
1470 | + spin_lock(&fi->fi_ast_lock); | |
1471 | + goto repeat; | |
1472 | + } | |
1473 | + | |
1474 | + current->state = TASK_RUNNING; | |
1475 | + remove_wait_queue(&fi->fi_wait, &wait); | |
1476 | + | |
1477 | + if (signal_pending(current)) { | |
1478 | + spin_unlock(&fi->fi_ast_lock); | |
1479 | + return -ERESTARTSYS; | |
1480 | + } | |
1481 | + } | |
1482 | + | |
1483 | + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list); | |
1484 | + list_del(&ast->list); | |
1485 | + spin_unlock(&fi->fi_ast_lock); | |
1486 | + | |
1487 | + ret = sizeof(struct dlm_lock_result); | |
1488 | + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result))) | |
1489 | + ret = -EFAULT; | |
1490 | + | |
1491 | + /* If it was a query then copy the result block back here */ | |
1492 | + if (ast->queryinfo) { | |
1493 | + int status = copy_query_result(ast); | |
1494 | + if (status) | |
1495 | + ret = status; | |
1496 | + } | |
1497 | + | |
1498 | + kfree(ast); | |
1499 | + return ret; | |
1500 | +} | |
1501 | + | |
1502 | +static unsigned int dlm_poll(struct file *file, poll_table *wait) | |
1503 | +{ | |
1504 | + struct file_info *fi = file->private_data; | |
1505 | + | |
1506 | + poll_wait(file, &fi->fi_wait, wait); | |
1507 | + | |
1508 | + spin_lock(&fi->fi_ast_lock); | |
1509 | + if (!list_empty(&fi->fi_ast_list)) { | |
1510 | + spin_unlock(&fi->fi_ast_lock); | |
1511 | + return POLLIN | POLLRDNORM; | |
1512 | + } | |
1513 | + | |
1514 | + spin_unlock(&fi->fi_ast_lock); | |
1515 | + return 0; | |
1516 | +} | |
1517 | + | |
1518 | +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams) | |
1519 | +{ | |
1520 | + struct lock_info *li; | |
1521 | + int status; | |
1522 | + | |
1523 | + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); | |
1524 | + if (!li) | |
1525 | + return -ENOMEM; | |
1526 | + | |
1527 | + get_file_info(fi); | |
1528 | + li->li_user_lksb = kparams->lksb; | |
1529 | + li->li_astparam = kparams->astparam; | |
1530 | + li->li_bastaddr = kparams->bastaddr; | |
1531 | + li->li_astaddr = kparams->astaddr; | |
1532 | + li->li_file = fi; | |
1533 | + li->li_flags = 0; | |
1534 | + li->li_cmd = kparams->cmd; | |
1535 | + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); | |
1536 | + | |
1537 | + if (copy_from_user(&li->li_lksb, kparams->lksb, | |
1538 | + sizeof(struct dlm_lksb))) { | |
1539 | + kfree(li); | |
1540 | + return -EFAULT; | |
1541 | + } | |
1542 | + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr; | |
1543 | + | |
1544 | + /* Allocate query structs */ | |
1545 | + status = -ENOMEM; | |
1546 | + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL); | |
1547 | + if (!li->li_queryinfo) | |
1548 | + goto out1; | |
1549 | + | |
1550 | + /* Mainly to get gqi_lock buffer size */ | |
1551 | + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr, | |
1552 | + sizeof(struct dlm_queryinfo))) { | |
1553 | + status = -EFAULT; | |
1554 | + goto out1; | |
1555 | + } | |
1556 | + | |
1557 | + /* Overwrite userspace pointers we just copied with kernel space ones */ | |
1558 | + if (li->li_queryinfo->gqi_resinfo) { | |
1559 | + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL); | |
1560 | + if (!li->li_queryinfo->gqi_resinfo) | |
1561 | + goto out1; | |
1562 | + } | |
1563 | + if (li->li_queryinfo->gqi_lockinfo) { | |
1564 | + li->li_queryinfo->gqi_lockinfo = | |
1565 | + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize, | |
1566 | + GFP_KERNEL); | |
1567 | + if (!li->li_queryinfo->gqi_lockinfo) | |
1568 | + goto out2; | |
1569 | + } | |
1570 | + | |
1571 | + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo; | |
1572 | + | |
1573 | + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb, | |
1574 | + kparams->flags, /* query */ | |
1575 | + li->li_queryinfo, | |
1576 | + ast_routine, li); | |
1577 | + | |
1578 | + out2: | |
1579 | + kfree(li->li_queryinfo); | |
1580 | + | |
1581 | + out1: | |
1582 | + kfree(li); | |
1583 | + return status; | |
1584 | +} | |
1585 | + | |
1586 | +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams, | |
1587 | + const char *buffer) | |
1588 | +{ | |
1589 | + struct lock_info *li; | |
1590 | + int status; | |
1591 | + char name[DLM_RESNAME_MAXLEN]; | |
1592 | + | |
1593 | + /* | |
1594 | + * Validate things that we need to have correct. | |
1595 | + */ | |
1596 | + if (kparams->namelen > DLM_RESNAME_MAXLEN) | |
1597 | + return -EINVAL; | |
1598 | + | |
1599 | + if (!kparams->astaddr) | |
1600 | + return -EINVAL; | |
1601 | + | |
1602 | + if (!kparams->lksb) | |
1603 | + return -EINVAL; | |
1604 | + | |
1605 | + /* Get the lock name */ | |
1606 | + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name), | |
1607 | + kparams->namelen)) { | |
1608 | + return -EFAULT; | |
1609 | + } | |
1610 | + | |
1611 | + /* For conversions, the lock will already have a lock_info | |
1612 | + block squirreled away in astparam */ | |
1613 | + if (kparams->flags & DLM_LKF_CONVERT) { | |
1614 | + gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid); | |
1615 | + if (!lkb) { | |
1616 | + return -EINVAL; | |
1617 | + } | |
1618 | + li = (struct lock_info *)lkb->lkb_astparam; | |
1619 | + | |
1620 | + /* Only override these if they are provided */ | |
1621 | + if (li->li_user_lksb) | |
1622 | + li->li_user_lksb = kparams->lksb; | |
1623 | + if (li->li_astparam) | |
1624 | + li->li_astparam = kparams->astparam; | |
1625 | + if (li->li_bastaddr) | |
1626 | + li->li_bastaddr = kparams->bastaddr; | |
1627 | + if (li->li_bastaddr) | |
1628 | + li->li_astaddr = kparams->astaddr; | |
1629 | + li->li_flags = 0; | |
1630 | + } | |
1631 | + else { | |
1632 | + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); | |
1633 | + if (!li) | |
1634 | + return -ENOMEM; | |
1635 | + | |
1636 | + li->li_user_lksb = kparams->lksb; | |
1637 | + li->li_astparam = kparams->astparam; | |
1638 | + li->li_bastaddr = kparams->bastaddr; | |
1639 | + li->li_astaddr = kparams->astaddr; | |
1640 | + li->li_file = fi; | |
1641 | + li->li_flags = 0; | |
1642 | + li->li_cmd = kparams->cmd; | |
1643 | + li->li_queryinfo = NULL; | |
1644 | + | |
1645 | + /* semaphore to allow us to complete our work before | |
1646 | + the AST routine runs. In fact we only need (and use) this | |
1647 | + when the initial lock fails */ | |
1648 | + init_MUTEX_LOCKED(&li->li_firstlock); | |
1649 | + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); | |
1650 | + | |
1651 | + get_file_info(fi); | |
1652 | + } | |
1653 | + | |
1654 | + /* Copy the user's LKSB into kernel space, | |
1655 | + needed for conversions & value block operations */ | |
1656 | + if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb, | |
1657 | + sizeof(struct dlm_lksb))) | |
1658 | + return -EFAULT; | |
1659 | + | |
1660 | + /* Lock it ... */ | |
1661 | + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb, | |
1662 | + kparams->flags, name, kparams->namelen, | |
1663 | + kparams->parent, | |
1664 | + ast_routine, | |
1665 | + li, | |
1666 | + li->li_bastaddr ? bast_routine : NULL, | |
1667 | + kparams->range.ra_end ? &kparams->range : NULL); | |
1668 | + | |
1669 | + /* If it succeeded (this far) with a new lock then keep track of | |
1670 | + it on the file's lkb list */ | |
1671 | + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) { | |
1672 | + gd_lkb_t *lkb; | |
1673 | + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid); | |
1674 | + | |
1675 | + if (lkb) { | |
1676 | + spin_lock(&fi->fi_lkb_lock); | |
1677 | + list_add(&lkb->lkb_ownerqueue, | |
1678 | + &fi->fi_lkb_list); | |
1679 | + spin_unlock(&fi->fi_lkb_lock); | |
1680 | + } | |
1681 | + else { | |
1682 | + log_print("failed to get lkb for new lock"); | |
1683 | + } | |
1684 | + up(&li->li_firstlock); | |
1685 | + } | |
1686 | + | |
1687 | + return status; | |
1688 | +} | |
1689 | + | |
1690 | +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams) | |
1691 | +{ | |
1692 | + struct lock_info *li; | |
1693 | + gd_lkb_t *lkb; | |
1694 | + int status; | |
1695 | + | |
1696 | + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid); | |
1697 | + if (!lkb) { | |
1698 | + return -EINVAL; | |
1699 | + } | |
1700 | + | |
1701 | + li = (struct lock_info *)lkb->lkb_astparam; | |
1702 | + | |
1703 | + li->li_user_lksb = kparams->lksb; | |
1704 | + li->li_astparam = kparams->astparam; | |
1705 | + li->li_cmd = kparams->cmd; | |
1706 | + | |
1707 | + /* Have to do it here cos the lkb may not exist after | |
1708 | + * dlm_unlock() */ | |
1709 | + spin_lock(&fi->fi_lkb_lock); | |
1710 | + list_del(&lkb->lkb_ownerqueue); | |
1711 | + spin_unlock(&fi->fi_lkb_lock); | |
1712 | + | |
1713 | + /* Use existing lksb & astparams */ | |
1714 | + status = dlm_unlock(fi->fi_ls->ls_lockspace, | |
1715 | + kparams->lkid, | |
1716 | + kparams->flags, NULL, NULL); | |
1717 | + | |
1718 | + return status; | |
1719 | +} | |
1720 | + | |
1721 | +/* Write call, submit a locking request */ | |
1722 | +static ssize_t dlm_write(struct file *file, const char __user *buffer, | |
1723 | + size_t count, loff_t *ppos) | |
1724 | +{ | |
1725 | + struct file_info *fi = file->private_data; | |
1726 | + struct dlm_lock_params kparams; | |
1727 | + sigset_t tmpsig; | |
1728 | + sigset_t allsigs; | |
1729 | + int status; | |
1730 | + | |
1731 | + if (count < sizeof(kparams)) | |
1732 | + return -EINVAL; | |
1733 | + | |
1734 | + /* Has the lockspace been deleted */ | |
1735 | + if (test_bit(1, &fi->fi_ls->ls_flags)) | |
1736 | + return -ENOENT; | |
1737 | + | |
1738 | + /* Get the command info */ | |
1739 | + if (copy_from_user(&kparams, buffer, sizeof(kparams))) | |
1740 | + return -EFAULT; | |
1741 | + | |
1742 | + if (check_version(&kparams)) | |
1743 | + return -EINVAL; | |
1744 | + | |
1745 | + /* Block signals while we are doing this */ | |
1746 | + sigfillset(&allsigs); | |
1747 | + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); | |
1748 | + | |
1749 | + switch (kparams.cmd) | |
1750 | + { | |
1751 | + case DLM_USER_LOCK: | |
1752 | + status = do_user_lock(fi, &kparams, buffer); | |
1753 | + break; | |
1754 | + | |
1755 | + case DLM_USER_UNLOCK: | |
1756 | + status = do_user_unlock(fi, &kparams); | |
1757 | + break; | |
1758 | + | |
1759 | + case DLM_USER_QUERY: | |
1760 | + status = do_user_query(fi, &kparams); | |
1761 | + break; | |
1762 | + | |
1763 | + default: | |
1764 | + status = -EINVAL; | |
1765 | + break; | |
1766 | + } | |
1767 | + /* Restore signals */ | |
1768 | + sigprocmask(SIG_SETMASK, &tmpsig, NULL); | |
1769 | + recalc_sigpending(); | |
1770 | + | |
1771 | + if (status == 0) | |
1772 | + return count; | |
1773 | + else | |
1774 | + return status; | |
1775 | +} | |
1776 | + | |
1777 | +void dlm_device_free_devices() | |
1778 | +{ | |
1779 | + struct user_ls *tmp; | |
1780 | + struct user_ls *lsinfo; | |
1781 | + | |
1782 | + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) { | |
1783 | + misc_deregister(&lsinfo->ls_miscinfo); | |
1784 | + | |
1785 | + /* Tidy up, but don't delete the lsinfo struct until | |
1786 | + all the users have closed their devices */ | |
1787 | + list_del(&lsinfo->ls_list); | |
1788 | + kfree(lsinfo->ls_miscinfo.name); | |
1789 | + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */ | |
1790 | + } | |
1791 | +} | |
1792 | + | |
1793 | +static struct file_operations _dlm_fops = { | |
1794 | + .open = dlm_open, | |
1795 | + .release = dlm_close, | |
1796 | + .ioctl = dlm_ioctl, | |
1797 | + .read = dlm_read, | |
1798 | + .write = dlm_write, | |
1799 | + .poll = dlm_poll, | |
1800 | + .owner = THIS_MODULE, | |
1801 | +}; | |
1802 | + | |
1803 | +static struct file_operations _dlm_ctl_fops = { | |
1804 | + .open = dlm_ctl_open, | |
1805 | + .release = dlm_ctl_close, | |
1806 | + .ioctl = dlm_ctl_ioctl, | |
1807 | + .owner = THIS_MODULE, | |
1808 | +}; | |
1809 | + | |
1810 | +/* | |
1811 | + * Create control device | |
1812 | + */ | |
1813 | +int dlm_device_init(void) | |
1814 | +{ | |
1815 | + int r; | |
1816 | + | |
1817 | + INIT_LIST_HEAD(&user_ls_list); | |
1818 | + | |
1819 | + ctl_device.name = "dlm-control"; | |
1820 | + ctl_device.fops = &_dlm_ctl_fops; | |
1821 | + ctl_device.minor = MISC_DYNAMIC_MINOR; | |
1822 | + | |
1823 | + r = misc_register(&ctl_device); | |
1824 | + if (r) { | |
1825 | + log_print("misc_register failed for DLM control device"); | |
1826 | + return r; | |
1827 | + } | |
1828 | + | |
1829 | + return 0; | |
1830 | +} | |
1831 | + | |
1832 | +void dlm_device_exit(void) | |
1833 | +{ | |
1834 | + misc_deregister(&ctl_device); | |
1835 | +} | |
1836 | + | |
1837 | +/* | |
1838 | + * Overrides for Emacs so that we follow Linus's tabbing style. | |
1839 | + * Emacs will notice this stuff at the end of the file and automatically | |
1840 | + * adjust the settings for this buffer only. This must remain at the end | |
1841 | + * of the file. | |
1842 | + * --------------------------------------------------------------------------- | |
1843 | + * Local variables: | |
1844 | + * c-file-style: "linux" | |
1845 | + * End: | |
1846 | + */ | |
1847 | diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h | |
1848 | --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 1849 | +++ linux-patched/cluster/dlm/device.h 2004-06-29 20:01:19.000000000 +0800 |
4bf12011 | 1850 | @@ -0,0 +1,19 @@ |
1851 | +/****************************************************************************** | |
1852 | +******************************************************************************* | |
1853 | +** | |
1854 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
1855 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
1856 | +** | |
1857 | +** This copyrighted material is made available to anyone wishing to use, | |
1858 | +** modify, copy, or redistribute it subject to the terms and conditions | |
1859 | +** of the GNU General Public License v.2. | |
1860 | +** | |
1861 | +******************************************************************************* | |
1862 | +******************************************************************************/ | |
1863 | + | |
1864 | +#ifndef __DEVICE_DOT_H__ | |
1865 | +#define __DEVICE_DOT_H__ | |
1866 | + | |
1867 | +extern void dlm_device_free_devices(void); | |
1868 | + | |
1869 | +#endif /* __DEVICE_DOT_H__ */ | |
1870 | diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c | |
1871 | --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 1872 | +++ linux-patched/cluster/dlm/dir.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 1873 | @@ -0,0 +1,430 @@ |
1874 | +/****************************************************************************** | |
1875 | +******************************************************************************* | |
1876 | +** | |
1877 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
1878 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
1879 | +** | |
1880 | +** This copyrighted material is made available to anyone wishing to use, | |
1881 | +** modify, copy, or redistribute it subject to the terms and conditions | |
1882 | +** of the GNU General Public License v.2. | |
1883 | +** | |
1884 | +******************************************************************************* | |
1885 | +******************************************************************************/ | |
1886 | + | |
1887 | +#include "dlm_internal.h" | |
1888 | +#include "nodes.h" | |
1889 | +#include "lockspace.h" | |
1890 | +#include "lowcomms.h" | |
1891 | +#include "reccomms.h" | |
1892 | +#include "rsb.h" | |
1893 | +#include "config.h" | |
1894 | +#include "memory.h" | |
1895 | +#include "recover.h" | |
1896 | +#include "util.h" | |
1897 | + | |
1898 | +/* | |
1899 | + * We use the upper 16 bits of the hash value to select the directory node. | |
1900 | + * Low bits are used for distribution of rsb's among hash buckets on each node. | |
1901 | + * | |
1902 | + * From the hash value, we are interested in arriving at a final value between | |
1903 | + * zero and the number of nodes minus one (num_nodes - 1). | |
1904 | + * | |
1905 | + * To accomplish this scaling, we take the nearest power of two larger than | |
1906 | + * num_nodes and subtract one to create a bit mask. The mask is applied to the | |
1907 | + * hash, reducing the range to nearer the final range. | |
1908 | + * | |
1909 | + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of | |
1910 | + * num_nodes to the previously masked hash value. | |
1911 | + * | |
1912 | + * This value in the desired range is used as an offset into the sorted list of | |
1913 | + * nodeid's to give the particular nodeid of the directory node. | |
1914 | + */ | |
1915 | + | |
1916 | +uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length) | |
1917 | +{ | |
1918 | + struct list_head *tmp; | |
1919 | + gd_csb_t *csb = NULL; | |
1920 | + uint32_t hash, node, n = 0, nodeid; | |
1921 | + | |
1922 | + if (ls->ls_num_nodes == 1) { | |
1923 | + nodeid = our_nodeid(); | |
1924 | + goto out; | |
1925 | + } | |
1926 | + | |
1927 | + hash = gdlm_hash(name, length); | |
1928 | + node = (hash >> 16) & ls->ls_nodes_mask; | |
1929 | + node %= ls->ls_num_nodes; | |
1930 | + | |
1931 | + list_for_each(tmp, &ls->ls_nodes) { | |
1932 | + if (n++ != node) | |
1933 | + continue; | |
1934 | + csb = list_entry(tmp, gd_csb_t, csb_list); | |
1935 | + break; | |
1936 | + } | |
1937 | + | |
1938 | + GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n", | |
1939 | + ls->ls_num_nodes, n, node, ls->ls_nodes_mask);); | |
1940 | + nodeid = csb->csb_node->gn_nodeid; | |
1941 | + | |
1942 | + out: | |
1943 | + return nodeid; | |
1944 | +} | |
1945 | + | |
1946 | +uint32_t get_directory_nodeid(gd_res_t *rsb) | |
1947 | +{ | |
1948 | + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name, | |
1949 | + rsb->res_length); | |
1950 | +} | |
1951 | + | |
1952 | +static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len) | |
1953 | +{ | |
1954 | + uint32_t val; | |
1955 | + | |
1956 | + val = gdlm_hash(name, len); | |
1957 | + val &= RESDIRHASH_MASK; | |
1958 | + | |
1959 | + return val; | |
1960 | +} | |
1961 | + | |
1962 | +static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd) | |
1963 | +{ | |
1964 | + gd_resdir_bucket_t *bucket; | |
1965 | + uint32_t hashval; | |
1966 | + | |
1967 | + hashval = rd_hash(ls, rd->rd_name, rd->rd_length); | |
1968 | + bucket = &ls->ls_resdir_hash[hashval]; | |
1969 | + | |
1970 | + list_add_tail(&rd->rd_list, &bucket->rb_reslist); | |
1971 | +} | |
1972 | + | |
1973 | +static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen, | |
1974 | + uint32_t bucket) | |
1975 | +{ | |
1976 | + struct list_head *head; | |
1977 | + gd_resdata_t *rd; | |
1978 | + | |
1979 | + head = &ls->ls_resdir_hash[bucket].rb_reslist; | |
1980 | + list_for_each_entry(rd, head, rd_list) { | |
1981 | + if (rd->rd_length == namelen && | |
1982 | + !memcmp(name, rd->rd_name, namelen)) | |
1983 | + goto out; | |
1984 | + } | |
1985 | + rd = NULL; | |
1986 | + out: | |
1987 | + return rd; | |
1988 | +} | |
1989 | + | |
1990 | +void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen, | |
1991 | + uint8_t sequence) | |
1992 | +{ | |
1993 | + gd_resdata_t *rd; | |
1994 | + uint32_t bucket; | |
1995 | + | |
1996 | + bucket = rd_hash(ls, name, namelen); | |
1997 | + | |
1998 | + write_lock(&ls->ls_resdir_hash[bucket].rb_lock); | |
1999 | + | |
2000 | + rd = search_rdbucket(ls, name, namelen, bucket); | |
2001 | + | |
2002 | + if (!rd) { | |
2003 | + log_debug(ls, "remove_resdata not found nodeid=%u", nodeid); | |
2004 | + goto out; | |
2005 | + } | |
2006 | + | |
2007 | + if (rd->rd_master_nodeid != nodeid) { | |
2008 | + log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid); | |
2009 | + goto out; | |
2010 | + } | |
2011 | + | |
2012 | + if (rd->rd_sequence == sequence) { | |
2013 | + list_del(&rd->rd_list); | |
2014 | + free_resdata(rd); | |
2015 | + } else { | |
2016 | + /* | |
2017 | + log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u", | |
2018 | + nodeid, rd->rd_sequence, sequence); | |
2019 | + */ | |
2020 | + } | |
2021 | + | |
2022 | + out: | |
2023 | + write_unlock(&ls->ls_resdir_hash[bucket].rb_lock); | |
2024 | +} | |
2025 | + | |
2026 | +void resdir_clear(gd_ls_t *ls) | |
2027 | +{ | |
2028 | + struct list_head *head; | |
2029 | + gd_resdata_t *rd; | |
2030 | + int i; | |
2031 | + | |
2032 | + for (i = 0; i < RESDIRHASH_SIZE; i++) { | |
2033 | + head = &ls->ls_resdir_hash[i].rb_reslist; | |
2034 | + while (!list_empty(head)) { | |
2035 | + rd = list_entry(head->next, gd_resdata_t, rd_list); | |
2036 | + list_del(&rd->rd_list); | |
2037 | + free_resdata(rd); | |
2038 | + } | |
2039 | + } | |
2040 | +} | |
2041 | + | |
2042 | +static void gdlm_resmov_in(gd_resmov_t *rm, char *buf) | |
2043 | +{ | |
2044 | + gd_resmov_t tmp; | |
2045 | + | |
2046 | + memcpy(&tmp, buf, sizeof(gd_resmov_t)); | |
2047 | + | |
2048 | + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid); | |
2049 | + rm->rm_length = be16_to_cpu(tmp.rm_length); | |
2050 | +} | |
2051 | + | |
2052 | +int resdir_rebuild_local(gd_ls_t *ls) | |
2053 | +{ | |
2054 | + gd_csb_t *csb; | |
2055 | + gd_resdata_t *rd; | |
2056 | + gd_rcom_t *rc; | |
2057 | + gd_resmov_t mov, last_mov; | |
2058 | + char *b, *last_name; | |
2059 | + int error = -ENOMEM, count = 0; | |
2060 | + | |
2061 | + log_all(ls, "rebuild resource directory"); | |
2062 | + | |
2063 | + resdir_clear(ls); | |
2064 | + | |
2065 | + rc = allocate_rcom_buffer(ls); | |
2066 | + if (!rc) | |
2067 | + goto out; | |
2068 | + | |
2069 | + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); | |
2070 | + if (!last_name) | |
2071 | + goto free_rc; | |
2072 | + | |
2073 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
2074 | + last_mov.rm_length = 0; | |
2075 | + for (;;) { | |
2076 | + error = gdlm_recovery_stopped(ls); | |
2077 | + if (error) | |
2078 | + goto free_last; | |
2079 | + | |
2080 | + memcpy(rc->rc_buf, last_name, last_mov.rm_length); | |
2081 | + rc->rc_datalen = last_mov.rm_length; | |
2082 | + | |
2083 | + error = rcom_send_message(ls, csb->csb_node->gn_nodeid, | |
2084 | + RECCOMM_RECOVERNAMES, rc, 1); | |
2085 | + if (error) | |
2086 | + goto free_last; | |
2087 | + | |
2088 | + schedule(); | |
2089 | + | |
2090 | + /* | |
2091 | + * pick each res out of buffer | |
2092 | + */ | |
2093 | + | |
2094 | + b = rc->rc_buf; | |
2095 | + | |
2096 | + for (;;) { | |
2097 | + gdlm_resmov_in(&mov, b); | |
2098 | + b += sizeof(gd_resmov_t); | |
2099 | + | |
2100 | + /* Length of 0 with a non-zero nodeid marks the | |
2101 | + * end of the list */ | |
2102 | + if (!mov.rm_length && mov.rm_nodeid) | |
2103 | + goto done; | |
2104 | + | |
2105 | + /* This is just the end of the block */ | |
2106 | + if (!mov.rm_length) | |
2107 | + break; | |
2108 | + | |
2109 | + error = -ENOMEM; | |
2110 | + rd = allocate_resdata(ls, mov.rm_length); | |
2111 | + if (!rd) | |
2112 | + goto free_last; | |
2113 | + | |
2114 | + rd->rd_master_nodeid = mov.rm_nodeid; | |
2115 | + rd->rd_length = mov.rm_length; | |
2116 | + rd->rd_sequence = 1; | |
2117 | + | |
2118 | + memcpy(rd->rd_name, b, mov.rm_length); | |
2119 | + b += mov.rm_length; | |
2120 | + | |
2121 | + add_resdata_to_hash(ls, rd); | |
2122 | + count++; | |
2123 | + | |
2124 | + last_mov = mov; | |
2125 | + memset(last_name, 0, DLM_RESNAME_MAXLEN); | |
2126 | + memcpy(last_name, rd->rd_name, rd->rd_length); | |
2127 | + } | |
2128 | + } | |
2129 | + done: | |
2130 | + ; | |
2131 | + } | |
2132 | + | |
2133 | + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags); | |
2134 | + error = 0; | |
2135 | + | |
2136 | + log_all(ls, "rebuilt %d resources", count); | |
2137 | + | |
2138 | + free_last: | |
2139 | + kfree(last_name); | |
2140 | + | |
2141 | + free_rc: | |
2142 | + free_rcom_buffer(rc); | |
2143 | + | |
2144 | + out: | |
2145 | + return error; | |
2146 | +} | |
2147 | + | |
2148 | +/* | |
2149 | + * The reply end of resdir_rebuild_local/RECOVERNAMES. Collect and send as | |
2150 | + * many resource names as can fit in the buffer. | |
2151 | + */ | |
2152 | + | |
2153 | +int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf, | |
2154 | + int outlen, uint32_t nodeid) | |
2155 | +{ | |
2156 | + struct list_head *list; | |
2157 | + gd_res_t *start_rsb = NULL, *rsb; | |
2158 | + int offset = 0, start_namelen, error; | |
2159 | + char *start_name; | |
2160 | + gd_resmov_t tmp; | |
2161 | + uint32_t dir_nodeid; | |
2162 | + | |
2163 | + /* | |
2164 | + * Find the rsb where we left off (or start again) | |
2165 | + */ | |
2166 | + | |
2167 | + start_namelen = inlen; | |
2168 | + start_name = inbuf; | |
2169 | + | |
2170 | + if (start_namelen > 1) { | |
2171 | + error = find_or_create_rsb(ls, NULL, start_name, | |
2172 | + start_namelen, 0, &start_rsb); | |
2173 | + GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error);); | |
2174 | + release_rsb(start_rsb); | |
2175 | + } | |
2176 | + | |
2177 | + /* | |
2178 | + * Send rsb names for rsb's we're master of and whose directory node | |
2179 | + * matches the requesting node. | |
2180 | + */ | |
2181 | + | |
2182 | + down_read(&ls->ls_rec_rsblist); | |
2183 | + if (start_rsb) | |
2184 | + list = start_rsb->res_rootlist.next; | |
2185 | + else | |
2186 | + list = ls->ls_rootres.next; | |
2187 | + | |
2188 | + for (offset = 0; list != &ls->ls_rootres; list = list->next) { | |
2189 | + rsb = list_entry(list, gd_res_t, res_rootlist); | |
2190 | + if (rsb->res_nodeid) | |
2191 | + continue; | |
2192 | + | |
2193 | + dir_nodeid = get_directory_nodeid(rsb); | |
2194 | + if (dir_nodeid != nodeid) | |
2195 | + continue; | |
2196 | + | |
2197 | + if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) { | |
2198 | + /* Write end-of-block record */ | |
2199 | + memset(&tmp, 0, sizeof(gd_resmov_t)); | |
2200 | + memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t)); | |
2201 | + offset += sizeof(gd_resmov_t); | |
2202 | + goto out; | |
2203 | + } | |
2204 | + | |
2205 | + memset(&tmp, 0, sizeof(gd_resmov_t)); | |
2206 | + tmp.rm_nodeid = cpu_to_be32(our_nodeid()); | |
2207 | + tmp.rm_length = cpu_to_be16(rsb->res_length); | |
2208 | + | |
2209 | + memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t)); | |
2210 | + offset += sizeof(gd_resmov_t); | |
2211 | + | |
2212 | + memcpy(outbuf + offset, rsb->res_name, rsb->res_length); | |
2213 | + offset += rsb->res_length; | |
2214 | + } | |
2215 | + | |
2216 | + /* | |
2217 | + * If we've reached the end of the list (and there's room) write a | |
2218 | + * terminating record. | |
2219 | + */ | |
2220 | + | |
2221 | + if ((list == &ls->ls_rootres) && | |
2222 | + (offset + sizeof(gd_resmov_t) <= outlen)) { | |
2223 | + | |
2224 | + memset(&tmp, 0, sizeof(gd_resmov_t)); | |
2225 | + /* This only needs to be non-zero */ | |
2226 | + tmp.rm_nodeid = cpu_to_be32(1); | |
2227 | + /* and this must be zero */ | |
2228 | + tmp.rm_length = 0; | |
2229 | + memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t)); | |
2230 | + offset += sizeof(gd_resmov_t); | |
2231 | + } | |
2232 | + | |
2233 | + out: | |
2234 | + up_read(&ls->ls_rec_rsblist); | |
2235 | + return offset; | |
2236 | +} | |
2237 | + | |
2238 | +int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen, | |
2239 | + gd_resdata_t **rdp, int recovery) | |
2240 | +{ | |
2241 | + gd_resdata_t *rd; | |
2242 | + gd_resdata_t *tmp; | |
2243 | + uint32_t bucket; | |
2244 | + | |
2245 | + bucket = rd_hash(ls, name, namelen); | |
2246 | + | |
2247 | + read_lock(&ls->ls_resdir_hash[bucket].rb_lock); | |
2248 | + rd = search_rdbucket(ls, name, namelen, bucket); | |
2249 | + read_unlock(&ls->ls_resdir_hash[bucket].rb_lock); | |
2250 | + | |
2251 | + if (rd) | |
2252 | + goto out; | |
2253 | + | |
2254 | + rd = allocate_resdata(ls, namelen); | |
2255 | + if (!rd) | |
2256 | + return -ENOMEM; | |
2257 | + | |
2258 | + rd->rd_master_nodeid = nodeid; | |
2259 | + rd->rd_length = namelen; | |
2260 | + memcpy(rd->rd_name, name, namelen); | |
2261 | + | |
2262 | + write_lock(&ls->ls_resdir_hash[bucket].rb_lock); | |
2263 | + tmp = search_rdbucket(ls, name, namelen, bucket); | |
2264 | + if (!tmp) | |
2265 | + list_add_tail(&rd->rd_list, | |
2266 | + &ls->ls_resdir_hash[bucket].rb_reslist); | |
2267 | + write_unlock(&ls->ls_resdir_hash[bucket].rb_lock); | |
2268 | + | |
2269 | + if (tmp) { | |
2270 | + free_resdata(rd); | |
2271 | + rd = tmp; | |
2272 | + } | |
2273 | + | |
2274 | + out: | |
2275 | + *rdp = rd; | |
2276 | + | |
2277 | + if (!recovery) { | |
2278 | + if (++rd->rd_sequence == 0) | |
2279 | + rd->rd_sequence++; | |
2280 | + } else | |
2281 | + rd->rd_sequence = 1; | |
2282 | + | |
2283 | + return 0; | |
2284 | +} | |
2285 | + | |
2286 | +/* | |
2287 | + * The node with lowest id queries all nodes to determine when all are done. | |
2288 | + * All other nodes query the low nodeid for this. | |
2289 | + */ | |
2290 | + | |
2291 | +int resdir_rebuild_wait(gd_ls_t *ls) | |
2292 | +{ | |
2293 | + int error; | |
2294 | + | |
2295 | + if (ls->ls_low_nodeid == our_nodeid()) { | |
2296 | + error = gdlm_wait_status_all(ls, RESDIR_VALID); | |
2297 | + if (!error) | |
2298 | + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags); | |
2299 | + } else | |
2300 | + error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID); | |
2301 | + | |
2302 | + return error; | |
2303 | +} | |
2304 | diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h | |
2305 | --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 2306 | +++ linux-patched/cluster/dlm/dir.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 2307 | @@ -0,0 +1,30 @@ |
2308 | +/****************************************************************************** | |
2309 | +******************************************************************************* | |
2310 | +** | |
2311 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
2312 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
2313 | +** | |
2314 | +** This copyrighted material is made available to anyone wishing to use, | |
2315 | +** modify, copy, or redistribute it subject to the terms and conditions | |
2316 | +** of the GNU General Public License v.2. | |
2317 | +** | |
2318 | +******************************************************************************* | |
2319 | +******************************************************************************/ | |
2320 | + | |
2321 | +#ifndef __DIR_DOT_H__ | |
2322 | +#define __DIR_DOT_H__ | |
2323 | + | |
2324 | +uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length); | |
2325 | +uint32_t get_directory_nodeid(gd_res_t * rsb); | |
2326 | +void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen, | |
2327 | + uint8_t sequence); | |
2328 | +int resdir_rebuild_local(gd_ls_t * ls); | |
2329 | +int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf, | |
2330 | + int outlen, uint32_t nodeid); | |
2331 | +int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen, | |
2332 | + gd_resdata_t ** rdp, int recovery); | |
2333 | +int resdir_rebuild_wait(gd_ls_t * ls); | |
2334 | +void resdir_clear(gd_ls_t * ls); | |
2335 | +void resdir_dump(gd_ls_t * ls); | |
2336 | + | |
2337 | +#endif /* __DIR_DOT_H__ */ | |
2338 | diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h | |
2339 | --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
2340 | +++ linux-patched/cluster/dlm/dlm_internal.h 2004-06-29 20:01:20.000000000 +0800 |
2341 | @@ -0,0 +1,626 @@ | |
4bf12011 | 2342 | +/****************************************************************************** |
2343 | +******************************************************************************* | |
2344 | +** | |
2345 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
2346 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
2347 | +** | |
2348 | +** This copyrighted material is made available to anyone wishing to use, | |
2349 | +** modify, copy, or redistribute it subject to the terms and conditions | |
2350 | +** of the GNU General Public License v.2. | |
2351 | +** | |
2352 | +******************************************************************************* | |
2353 | +******************************************************************************/ | |
2354 | + | |
2355 | +#ifndef __DLM_INTERNAL_DOT_H__ | |
2356 | +#define __DLM_INTERNAL_DOT_H__ | |
2357 | + | |
2358 | +/* | |
2359 | + * This is the main header file to be included in each DLM source file. | |
2360 | + */ | |
2361 | + | |
2362 | +#define DLM_RELEASE_NAME "<CVS>" | |
2363 | + | |
2364 | +#include <linux/slab.h> | |
2365 | +#include <linux/sched.h> | |
2366 | +#include <asm/semaphore.h> | |
2367 | +#include <linux/types.h> | |
2368 | +#include <linux/spinlock.h> | |
2369 | +#include <linux/vmalloc.h> | |
2370 | +#include <asm/uaccess.h> | |
2371 | +#include <linux/list.h> | |
2372 | +#include <linux/errno.h> | |
2373 | +#include <linux/random.h> | |
2374 | + | |
2375 | +#include <cluster/dlm.h> | |
2376 | +#include <cluster/dlm_device.h> | |
2377 | +#include <cluster/service.h> | |
2378 | + | |
2379 | +#ifndef TRUE | |
2380 | +#define TRUE (1) | |
2381 | +#endif | |
2382 | + | |
2383 | +#ifndef FALSE | |
2384 | +#define FALSE (0) | |
2385 | +#endif | |
2386 | + | |
2387 | +#if (BITS_PER_LONG == 64) | |
2388 | +#define PRIu64 "lu" | |
2389 | +#define PRId64 "ld" | |
2390 | +#define PRIo64 "lo" | |
2391 | +#define PRIx64 "lx" | |
2392 | +#define PRIX64 "lX" | |
2393 | +#define SCNu64 "lu" | |
2394 | +#define SCNd64 "ld" | |
2395 | +#define SCNo64 "lo" | |
2396 | +#define SCNx64 "lx" | |
2397 | +#define SCNX64 "lX" | |
2398 | +#else | |
2399 | +#define PRIu64 "Lu" | |
2400 | +#define PRId64 "Ld" | |
2401 | +#define PRIo64 "Lo" | |
2402 | +#define PRIx64 "Lx" | |
2403 | +#define PRIX64 "LX" | |
2404 | +#define SCNu64 "Lu" | |
2405 | +#define SCNd64 "Ld" | |
2406 | +#define SCNo64 "Lo" | |
2407 | +#define SCNx64 "Lx" | |
2408 | +#define SCNX64 "LX" | |
2409 | +#endif | |
2410 | + | |
2411 | +#define wchan_cond_sleep_intr(chan, sleep_cond) \ | |
2412 | +do \ | |
2413 | +{ \ | |
2414 | + DECLARE_WAITQUEUE(__wait_chan, current); \ | |
2415 | + current->state = TASK_INTERRUPTIBLE; \ | |
2416 | + add_wait_queue(&chan, &__wait_chan); \ | |
2417 | + if ((sleep_cond)) \ | |
2418 | + schedule(); \ | |
2419 | + remove_wait_queue(&chan, &__wait_chan); \ | |
2420 | + current->state = TASK_RUNNING; \ | |
2421 | +} \ | |
2422 | +while (0) | |
2423 | + | |
2424 | +static inline int check_timeout(unsigned long stamp, unsigned int seconds) | |
2425 | +{ | |
2426 | + return time_after(jiffies, stamp + seconds * HZ); | |
2427 | +} | |
2428 | + | |
2429 | + | |
2430 | +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args) | |
2431 | + | |
2432 | +#define log_all(ls, fmt, args...) \ | |
2433 | + do { \ | |
2434 | + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \ | |
2435 | + dlm_debug_log(ls, fmt, ##args); \ | |
2436 | + } while (0) | |
2437 | + | |
2438 | +#define log_error log_all | |
2439 | + | |
2440 | + | |
2441 | +#define DLM_DEBUG | |
2442 | +#if defined(DLM_DEBUG) | |
2443 | +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args) | |
2444 | +#else | |
2445 | +#define log_debug(ls, fmt, args...) | |
2446 | +#endif | |
2447 | + | |
2448 | +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL) | |
2449 | +#undef log_debug | |
2450 | +#define log_debug log_all | |
2451 | +#endif | |
2452 | + | |
2453 | + | |
2454 | +#define GDLM_ASSERT(x, do) \ | |
2455 | +{ \ | |
2456 | + if (!(x)) \ | |
2457 | + { \ | |
2458 | + dlm_debug_dump(); \ | |
2459 | + printk("\nDLM: Assertion failed on line %d of file %s\n" \ | |
2460 | + "DLM: assertion: \"%s\"\n" \ | |
2461 | + "DLM: time = %lu\n", \ | |
2462 | + __LINE__, __FILE__, #x, jiffies); \ | |
2463 | + {do} \ | |
2464 | + printk("\n"); \ | |
2465 | + BUG(); \ | |
2466 | + panic("DLM: Record message above and reboot.\n"); \ | |
2467 | + } \ | |
2468 | +} | |
2469 | + | |
2470 | + | |
2471 | +struct gd_ls; | |
2472 | +struct gd_lkb; | |
2473 | +struct gd_res; | |
2474 | +struct gd_csb; | |
2475 | +struct gd_node; | |
2476 | +struct gd_resmov; | |
2477 | +struct gd_resdata; | |
2478 | +struct gd_recover; | |
2479 | +struct gd_recinfo; | |
2480 | +struct gd_resdir_bucket; | |
2481 | +struct gd_remlockreply; | |
2482 | +struct gd_remlockrequest; | |
2483 | +struct gd_rcom; | |
2484 | + | |
2485 | +typedef struct gd_ls gd_ls_t; | |
2486 | +typedef struct gd_lkb gd_lkb_t; | |
2487 | +typedef struct gd_res gd_res_t; | |
2488 | +typedef struct gd_csb gd_csb_t; | |
2489 | +typedef struct gd_node gd_node_t; | |
2490 | +typedef struct gd_resmov gd_resmov_t; | |
2491 | +typedef struct gd_resdata gd_resdata_t; | |
2492 | +typedef struct gd_recover gd_recover_t; | |
2493 | +typedef struct gd_resdir_bucket gd_resdir_bucket_t; | |
2494 | +typedef struct gd_rcom gd_rcom_t; | |
2495 | + | |
2496 | +/* | |
2497 | + * Resource Data - an entry for a resource in the resdir hash table | |
2498 | + */ | |
2499 | + | |
2500 | +struct gd_resdata { | |
2501 | + struct list_head rd_list; | |
2502 | + uint32_t rd_master_nodeid; | |
2503 | + uint16_t rd_length; | |
2504 | + uint8_t rd_sequence; | |
2505 | + char rd_name[1]; /* <rd_length> bytes */ | |
2506 | +}; | |
2507 | + | |
2508 | +/* | |
2509 | + * Resource Directory Bucket - a hash bucket of resdata entries in the resdir | |
2510 | + * hash table | |
2511 | + */ | |
2512 | + | |
2513 | +struct gd_resdir_bucket { | |
2514 | + struct list_head rb_reslist; | |
2515 | + rwlock_t rb_lock; | |
2516 | +}; | |
2517 | + | |
2518 | +/* | |
2519 | + * A resource description as moved between nodes | |
2520 | + */ | |
2521 | + | |
2522 | +struct gd_resmov { | |
2523 | + uint32_t rm_nodeid; | |
2524 | + uint16_t rm_length; | |
2525 | + uint16_t rm_pad; | |
2526 | +}; | |
2527 | + | |
2528 | +/* | |
2529 | + * An entry in the lock ID table. Locks for this bucket are kept on list. | |
2530 | + * Counter is used to assign an id to locks as they are added to this bucket. | |
2531 | + */ | |
2532 | + | |
2533 | +struct gd_lockidtbl_entry { | |
2534 | + struct list_head list; | |
2535 | + uint16_t counter; | |
2536 | +}; | |
2537 | + | |
2538 | +/* Elements in the range array */ | |
2539 | + | |
2540 | +#define GR_RANGE_START 0 | |
2541 | +#define GR_RANGE_END 1 | |
2542 | +#define RQ_RANGE_START 2 | |
2543 | +#define RQ_RANGE_END 3 | |
2544 | + | |
2545 | +/* | |
2546 | + * Lockspace structure. The context for GDLM locks. | |
2547 | + */ | |
2548 | + | |
2549 | +#define RESHASHTBL_SIZE (256) | |
2550 | + | |
2551 | +#define RESDIRHASH_SHIFT (9) | |
2552 | +#define RESDIRHASH_SIZE (1 << RESDIRHASH_SHIFT) | |
2553 | +#define RESDIRHASH_MASK (RESDIRHASH_SIZE - 1) | |
2554 | + | |
2555 | +#define LSFL_WORK (0) | |
2556 | +#define LSFL_LS_RUN (1) | |
2557 | +#define LSFL_LS_STOP (2) | |
2558 | +#define LSFL_LS_START (3) | |
2559 | +#define LSFL_LS_FINISH (4) | |
2560 | +#define LSFL_RECCOMM_WAIT (5) | |
2561 | +#define LSFL_RECCOMM_READY (6) | |
2562 | +#define LSFL_NOTIMERS (7) | |
2563 | +#define LSFL_FINISH_RECOVERY (8) | |
2564 | +#define LSFL_RESDIR_VALID (9) | |
2565 | +#define LSFL_ALL_RESDIR_VALID (10) | |
2566 | +#define LSFL_NODES_VALID (11) | |
2567 | +#define LSFL_ALL_NODES_VALID (12) | |
2568 | +#define LSFL_REQUEST_WARN (13) | |
2569 | + | |
2570 | +#define LSST_NONE (0) | |
2571 | +#define LSST_INIT (1) | |
2572 | +#define LSST_INIT_DONE (2) | |
2573 | +#define LSST_CLEAR (3) | |
2574 | +#define LSST_WAIT_START (4) | |
2575 | +#define LSST_RECONFIG_DONE (5) | |
2576 | + | |
2577 | +struct gd_ls { | |
2578 | + struct list_head ls_list; /* list of lockspaces */ | |
2579 | + uint32_t ls_local_id; /* local unique lockspace ID */ | |
2580 | + uint32_t ls_global_id; /* global unique lockspace ID */ | |
2581 | + int ls_allocation; /* Memory allocation policy */ | |
2582 | + unsigned long ls_flags; /* LSFL_ */ | |
2583 | + | |
2584 | + struct list_head ls_rootres; /* List of root resources */ | |
2585 | + | |
2586 | + int ls_hashsize; | |
2587 | + int ls_hashmask; | |
2588 | + struct list_head *ls_reshashtbl; /* Hash table for resources */ | |
2589 | + rwlock_t ls_reshash_lock; /* Lock for hash table */ | |
2590 | + | |
2591 | + struct gd_lockidtbl_entry *ls_lockidtbl; | |
2592 | + uint32_t ls_lockidtbl_size; /* Size of lock id table */ | |
2593 | + rwlock_t ls_lockidtbl_lock; | |
2594 | + | |
2595 | + struct list_head ls_nodes; /* current nodes in RC */ | |
2596 | + uint32_t ls_num_nodes; /* number of nodes in RC */ | |
2597 | + uint32_t ls_nodes_mask; | |
2598 | + uint32_t ls_low_nodeid; | |
2599 | + | |
2600 | + int ls_state; /* state changes for recovery */ | |
2601 | + struct list_head ls_recover; /* gr_recover_t structs */ | |
2602 | + int ls_last_stop; /* event ids from sm */ | |
2603 | + int ls_last_start; | |
2604 | + int ls_last_finish; | |
2605 | + spinlock_t ls_recover_lock; | |
2606 | + struct list_head ls_nodes_gone; /* dead node list for recovery */ | |
2607 | + | |
2608 | + wait_queue_head_t ls_wait_general; | |
2609 | + | |
2610 | + gd_rcom_t *ls_rcom; | |
2611 | + uint32_t ls_rcom_msgid; | |
2612 | + struct semaphore ls_rcom_lock; | |
2613 | + | |
2614 | + struct list_head ls_recover_list; | |
2615 | + int ls_recover_list_count; | |
2616 | + spinlock_t ls_recover_list_lock; | |
2617 | + | |
2618 | + struct rw_semaphore ls_in_recovery; /* held in write during | |
2619 | + * recovery, read for normal | |
2620 | + * locking ops */ | |
2621 | + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a | |
2622 | + * parent lock racing with a | |
2623 | + * new child lock */ | |
2624 | + | |
2625 | + struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery | |
2626 | + * operations happening while | |
2627 | + * we are purging */ | |
2628 | + | |
2629 | + struct rw_semaphore ls_gap_rsblist; /* To protect rootres list | |
2630 | + * in grant_after_purge() which | |
2631 | + * runs outside recovery */ | |
2632 | + | |
2633 | + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees | |
2634 | + * we are deserialising | |
2635 | + */ | |
2636 | + | |
2637 | + struct list_head ls_deadlockq; /* List of locks in conversion ordered | |
2638 | + * by duetime. for deadlock detection */ | |
2639 | + | |
2640 | + struct list_head ls_requestqueue; /* List of incoming requests | |
2641 | + * held while we are in | |
2642 | + * recovery */ | |
2643 | + | |
2644 | + gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE]; | |
2645 | + | |
2646 | + int ls_namelen; | |
2647 | + char ls_name[1]; /* <namelen> bytes */ | |
2648 | +}; | |
2649 | + | |
2650 | +/* | |
2651 | + * Cluster node (per node in cluster) | |
2652 | + */ | |
2653 | + | |
2654 | +struct gd_node { | |
2655 | + struct list_head gn_list; /* global list of cluster nodes */ | |
2656 | + uint32_t gn_nodeid; /* cluster unique nodeid (cman) */ | |
2657 | + uint32_t gn_ipaddr; /* node's first IP address (cman) */ | |
2658 | + int gn_refcount; /* number of csb's referencing */ | |
2659 | +}; | |
2660 | + | |
2661 | +/* | |
2662 | + * Cluster System Block (per node in a ls) | |
2663 | + */ | |
2664 | + | |
2665 | +struct gd_csb { | |
2666 | + struct list_head csb_list; /* per-lockspace list of nodes */ | |
2667 | + gd_node_t *csb_node; /* global node structure */ | |
2668 | + int csb_gone_event; /* event id when node was removed */ | |
2669 | + | |
2670 | + uint32_t csb_names_send_count; | |
2671 | + uint32_t csb_names_send_msgid; | |
2672 | + uint32_t csb_names_recv_count; | |
2673 | + uint32_t csb_names_recv_msgid; | |
2674 | + uint32_t csb_locks_send_count; | |
2675 | + uint32_t csb_locks_send_msgid; | |
2676 | + uint32_t csb_locks_recv_count; | |
2677 | + uint32_t csb_locks_recv_msgid; | |
2678 | +}; | |
2679 | + | |
2680 | +/* | |
2681 | + * Resource block | |
2682 | + */ | |
2683 | + | |
2684 | +/* status */ | |
2685 | + | |
2686 | +#define GDLM_RESSTS_DIRENTRY 1 /* This is a directory entry */ | |
2687 | +#define GDLM_RESSTS_LVBINVALID 2 /* The LVB is invalid */ | |
2688 | + | |
2689 | +#define RESFL_NEW_MASTER (0) | |
2690 | +#define RESFL_RECOVER_LIST (1) | |
2691 | + | |
2692 | +struct gd_res { | |
2693 | + struct list_head res_hashchain; /* Chain of resources in this hash | |
2694 | + * bucket */ | |
2695 | + | |
2696 | + gd_ls_t *res_ls; /* The owning lockspace */ | |
2697 | + | |
2698 | + struct list_head res_rootlist; /* List of root resources in lockspace */ | |
2699 | + | |
2700 | + struct list_head res_subreslist; /* List of all sub-resources | |
2701 | + * for this root res. */ | |
2702 | + /* This is a list head on the root res and holds the whole tree below | |
2703 | + * it. */ | |
2704 | + uint8_t res_depth; /* Depth in resource tree */ | |
2705 | + uint16_t res_status; | |
2706 | + unsigned long res_flags; /* Flags, RESFL_ */ | |
2707 | + | |
2708 | + struct list_head res_grantqueue; | |
2709 | + struct list_head res_convertqueue; | |
2710 | + struct list_head res_waitqueue; | |
2711 | + | |
2712 | + uint32_t res_nodeid; /* nodeid of master node */ | |
2713 | + | |
2714 | + gd_res_t *res_root; /* If a subresource, this is our root */ | |
2715 | + gd_res_t *res_parent; /* Our parent resource (if any) */ | |
2716 | + | |
2717 | + atomic_t res_ref; /* No of lkb's */ | |
2718 | + uint16_t res_remasterid; /* ID used during remaster */ | |
2719 | + struct list_head res_recover_list; /* General list for use during | |
2720 | + * recovery */ | |
2721 | + int res_recover_msgid; | |
2722 | + int res_newlkid_expect; | |
2723 | + | |
2724 | + struct rw_semaphore res_lock; | |
2725 | + | |
2726 | + char *res_lvbptr; /* Lock value block */ | |
2727 | + | |
2728 | + uint8_t res_resdir_seq; /* Last directory sequence number */ | |
2729 | + | |
2730 | + uint8_t res_length; | |
2731 | + char res_name[1]; /* <res_length> bytes */ | |
2732 | +}; | |
2733 | + | |
2734 | +/* | |
2735 | + * Lock block. To avoid confusion, where flags mirror the | |
2736 | + * public flags, they should have the same value. | |
2737 | + */ | |
2738 | + | |
2739 | +#define GDLM_LKSTS_NEW (0) | |
2740 | +#define GDLM_LKSTS_WAITING (1) | |
2741 | +#define GDLM_LKSTS_GRANTED (2) | |
2742 | +#define GDLM_LKSTS_CONVERT (3) | |
2743 | + | |
2744 | +#define GDLM_LKFLG_VALBLK (0x00000008) | |
2745 | +#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */ | |
5cdbd17b AM |
2746 | +#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */ |
2747 | +#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */ | |
4bf12011 | 2748 | + |
2749 | +/* Internal flags */ | |
5cdbd17b AM |
2750 | +#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present |
2751 | + (remote protocol only) */ | |
4bf12011 | 2752 | +#define GDLM_LKFLG_MSTCPY (0x00002000) |
2753 | +#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */ | |
5cdbd17b | 2754 | +#define GDLM_LKFLG_LQCONVERT (0x00008000) |
4bf12011 | 2755 | +#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */ |
2756 | +#define GDLM_LKFLG_DEMOTED (0x00020000) | |
2757 | +#define GDLM_LKFLG_RESENT (0x00040000) | |
2758 | +#define GDLM_LKFLG_NOREBUILD (0x00080000) | |
4bf12011 | 2759 | + |
5cdbd17b AM |
2760 | +#define AST_COMP (1) |
2761 | +#define AST_BAST (2) | |
2762 | +#define AST_DEL (4) | |
4bf12011 | 2763 | + |
5cdbd17b AM |
2764 | +struct gd_lkb { |
2765 | + uint32_t lkb_flags; | |
2766 | + uint16_t lkb_status; /* grant, wait, convert */ | |
2767 | + int8_t lkb_rqmode; /* requested lock mode */ | |
2768 | + int8_t lkb_grmode; /* granted lock mode */ | |
2769 | + uint32_t lkb_retstatus; /* status to return in lksb */ | |
2770 | + uint32_t lkb_id; /* our lock ID */ | |
2771 | + struct dlm_lksb * lkb_lksb; /* status block of caller */ | |
2772 | + struct list_head lkb_idtbl_list; /* lockidtbl */ | |
2773 | + struct list_head lkb_statequeue; /* rsb's g/c/w queue */ | |
2774 | + gd_res_t * lkb_resource; | |
2775 | + struct list_head lkb_ownerqueue; /* list of locks owned by a | |
2776 | + process */ | |
2777 | + gd_lkb_t * lkb_parent; /* parent lock if any */ | |
2778 | + atomic_t lkb_childcnt; /* number of children */ | |
2779 | + | |
2780 | + struct list_head lkb_lockqueue; /* queue of locks waiting | |
2781 | + for remote reply */ | |
2782 | + int lkb_lockqueue_state; /* reason on lockqueue */ | |
2783 | + int lkb_lockqueue_flags; /* as passed into | |
2784 | + lock/unlock */ | |
2785 | + unsigned long lkb_lockqueue_time; /* time lkb went on the | |
2786 | + lockqueue */ | |
2787 | + unsigned long lkb_duetime; /* for deadlock detection */ | |
2788 | + | |
2789 | + uint32_t lkb_remid; /* id on remote partner */ | |
2790 | + uint32_t lkb_nodeid; /* id of remote partner */ | |
2791 | + | |
2792 | + void * lkb_astaddr; | |
2793 | + void * lkb_bastaddr; | |
2794 | + long lkb_astparam; | |
2795 | + struct list_head lkb_astqueue; /* locks with asts to deliver */ | |
2796 | + uint16_t lkb_astflags; /* COMP, BAST, DEL */ | |
2797 | + uint8_t lkb_bastmode; /* requested mode */ | |
2798 | + uint8_t lkb_highbast; /* highest mode bast sent for */ | |
4bf12011 | 2799 | + |
2800 | + struct gd_remlockrequest *lkb_request; | |
2801 | + | |
5cdbd17b | 2802 | + struct list_head lkb_deadlockq; /* ls_deadlockq list */ |
4bf12011 | 2803 | + |
5cdbd17b AM |
2804 | + char * lkb_lvbptr; /* points to lksb lvb on local |
2805 | + lock, allocated lvb on | |
2806 | + remote lock */ | |
2807 | + uint64_t * lkb_range; /* Points to an array of 64 bit | |
2808 | + numbers that represent the | |
2809 | + requested and granted ranges | |
2810 | + of the lock. NULL implies | |
2811 | + 0-ffffffffffffffff */ | |
4bf12011 | 2812 | +}; |
2813 | + | |
2814 | +/* | |
2815 | + * Used to save and manage recovery state for a lockspace. | |
2816 | + */ | |
2817 | + | |
2818 | +struct gd_recover { | |
2819 | + struct list_head gr_list; | |
2820 | + uint32_t *gr_nodeids; | |
2821 | + int gr_node_count; | |
2822 | + int gr_event_id; | |
2823 | +}; | |
2824 | + | |
2825 | +/* | |
2826 | + * Header part of the mid-level comms system. All packets start with | |
2827 | + * this header so we can identify them. The comms packet can | |
2828 | + * contain many of these structs but they are split into individual | |
2829 | + * work units before being passed to the lockqueue routines. | |
2830 | + * below this are the structs that this is a header for | |
2831 | + */ | |
2832 | + | |
2833 | +struct gd_req_header { | |
2834 | + uint8_t rh_cmd; /* What we are */ | |
2835 | + uint8_t rh_flags; /* maybe just a pad */ | |
2836 | + uint16_t rh_length; /* Length of struct (so we can send several in | |
2837 | + * one message) */ | |
2838 | + uint32_t rh_lkid; /* Lock ID tag: ie the local (requesting) lock | |
2839 | + * ID */ | |
2840 | + uint32_t rh_lockspace; /* Lockspace ID */ | |
2841 | +}; | |
2842 | + | |
2843 | +/* | |
2844 | + * This is the struct used in a remote lock/unlock/convert request | |
2845 | + * The mid-level comms API should turn this into native byte order. | |
2846 | + * Most "normal" lock operations will use these two structs for | |
2847 | + * communications. Recovery operations use their own structs | |
2848 | + * but still with the gd_req_header on the front. | |
2849 | + */ | |
2850 | + | |
2851 | +struct gd_remlockrequest { | |
2852 | + struct gd_req_header rr_header; | |
2853 | + | |
2854 | + uint32_t rr_remlkid; /* Remote lock ID */ | |
2855 | + uint32_t rr_remparid; /* Parent's remote lock ID or 0 */ | |
2856 | + uint32_t rr_flags; /* Flags from lock/convert request */ | |
2857 | + uint64_t rr_range_start;/* Yes, these are in the right place... */ | |
2858 | + uint64_t rr_range_end; | |
2859 | + uint32_t rr_status; /* Status to return if this is an AST request */ | |
2860 | + uint8_t rr_rqmode; /* Requested lock mode */ | |
2861 | + uint8_t rr_asts; /* Whether the LKB has ASTs or not */ | |
2862 | + uint8_t rr_resdir_seq; /* Directory sequence number */ | |
2863 | + char rr_lvb[DLM_LVB_LEN]; /* Value block */ | |
2864 | + char rr_name[1]; /* As long as needs be. Only used for directory | |
2865 | + * lookups. The length of this can be worked | |
2866 | + * out from the packet length */ | |
2867 | +}; | |
2868 | + | |
2869 | +/* | |
2870 | + * This is the struct returned by a remote lock/unlock/convert request | |
2871 | + * The mid-level comms API should turn this into native byte order. | |
2872 | + */ | |
2873 | + | |
2874 | +struct gd_remlockreply { | |
2875 | + struct gd_req_header rl_header; | |
2876 | + | |
2877 | + uint32_t rl_lockstate; /* Whether request was queued/granted/waiting */ | |
2878 | + uint32_t rl_nodeid; /* nodeid of lock master */ | |
2879 | + uint32_t rl_status; /* Status to return to caller */ | |
2880 | + uint32_t rl_lkid; /* Remote lkid */ | |
2881 | + uint8_t rl_resdir_seq; /* Returned directory sequence number */ | |
2882 | + char rl_lvb[DLM_LVB_LEN]; /* LVB itself */ | |
2883 | +}; | |
2884 | + | |
2885 | +/* | |
2886 | + * Recovery comms message | |
2887 | + */ | |
2888 | + | |
2889 | +struct gd_rcom { | |
2890 | + struct gd_req_header rc_header; /* 32 byte aligned */ | |
2891 | + uint32_t rc_msgid; | |
2892 | + uint16_t rc_datalen; | |
2893 | + uint8_t rc_expanded; | |
2894 | + uint8_t rc_subcmd; /* secondary command */ | |
2895 | + char rc_buf[1]; /* first byte of data goes here and extends | |
2896 | + * beyond here for another datalen - 1 bytes. | |
2897 | + * rh_length is set to sizeof(gd_rcom_t) + | |
2898 | + * datalen - 1 */ | |
2899 | +}; | |
2900 | + | |
2901 | + | |
2902 | +/* A remote query: GDLM_REMCMD_QUERY */ | |
2903 | +struct gd_remquery { | |
2904 | + struct gd_req_header rq_header; | |
2905 | + | |
2906 | + uint32_t rq_mstlkid; /* LockID on master node */ | |
2907 | + uint32_t rq_query; /* query from the user */ | |
2908 | + uint32_t rq_maxlocks; /* max number of locks we can cope with */ | |
2909 | +}; | |
2910 | + | |
2911 | +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */ | |
2912 | +/* There may be subsequent blocks of | |
2913 | + lock info in GDLM_REMCMD_QUERYCONT messages which just have | |
2914 | + a normal header. The last of these will have rh_flags set to | |
2915 | + GDLM_REMFLAG_ENDQUERY | |
2916 | + */ | |
2917 | +struct gd_remqueryreply { | |
2918 | + struct gd_req_header rq_header; | |
2919 | + | |
2920 | + uint32_t rq_numlocks; /* Number of locks in reply */ | |
2921 | + uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */ | |
2922 | + uint32_t rq_status; | |
2923 | + | |
2924 | + /* Resource information */ | |
2925 | + uint32_t rq_grantcount; /* No. of nodes on grant queue */ | |
2926 | + uint32_t rq_convcount; /* No. of nodes on convert queue */ | |
2927 | + uint32_t rq_waitcount; /* No. of nodes on wait queue */ | |
2928 | + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */ | |
2929 | +}; | |
2930 | + | |
2931 | +/* | |
2932 | + * Lockqueue wait lock states | |
2933 | + */ | |
2934 | + | |
2935 | +#define GDLM_LQSTATE_WAIT_RSB 1 | |
2936 | +#define GDLM_LQSTATE_WAIT_CONVERT 2 | |
2937 | +#define GDLM_LQSTATE_WAIT_CONDGRANT 3 | |
2938 | +#define GDLM_LQSTATE_WAIT_UNLOCK 4 | |
2939 | + | |
2940 | +/* Commands sent across the comms link */ | |
2941 | +#define GDLM_REMCMD_LOOKUP 1 | |
2942 | +#define GDLM_REMCMD_LOCKREQUEST 2 | |
2943 | +#define GDLM_REMCMD_UNLOCKREQUEST 3 | |
2944 | +#define GDLM_REMCMD_CONVREQUEST 4 | |
2945 | +#define GDLM_REMCMD_LOCKREPLY 5 | |
2946 | +#define GDLM_REMCMD_LOCKGRANT 6 | |
2947 | +#define GDLM_REMCMD_SENDBAST 7 | |
2948 | +#define GDLM_REMCMD_SENDCAST 8 | |
2949 | +#define GDLM_REMCMD_REM_RESDATA 9 | |
2950 | +#define GDLM_REMCMD_RECOVERMESSAGE 20 | |
2951 | +#define GDLM_REMCMD_RECOVERREPLY 21 | |
2952 | +#define GDLM_REMCMD_QUERY 30 | |
2953 | +#define GDLM_REMCMD_QUERYREPLY 31 | |
2954 | + | |
2955 | +/* Set in rh_flags when this is the last block of | |
2956 | + query information. Note this could also be the first | |
2957 | + block */ | |
2958 | +#define GDLM_REMFLAG_ENDQUERY 1 | |
2959 | + | |
4bf12011 | 2960 | +#ifndef BUG_ON |
2961 | +#define BUG_ON(x) | |
2962 | +#endif | |
2963 | + | |
2964 | +void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...); | |
2965 | +void dlm_debug_dump(void); | |
2966 | + | |
2967 | +#endif /* __DLM_INTERNAL_DOT_H__ */ | |
2968 | diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c | |
2969 | --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 2970 | +++ linux-patched/cluster/dlm/lkb.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 2971 | @@ -0,0 +1,225 @@ |
2972 | +/****************************************************************************** | |
2973 | +******************************************************************************* | |
2974 | +** | |
2975 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
2976 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
2977 | +** | |
2978 | +** This copyrighted material is made available to anyone wishing to use, | |
2979 | +** modify, copy, or redistribute it subject to the terms and conditions | |
2980 | +** of the GNU General Public License v.2. | |
2981 | +** | |
2982 | +******************************************************************************* | |
2983 | +******************************************************************************/ | |
2984 | + | |
2985 | +/* | |
2986 | + * lkb.c | |
2987 | + * | |
2988 | + * Allocate and free locks on the lock ID table. | |
2989 | + * | |
2990 | + * This is slightly naff but I don't really like the | |
2991 | + * VMS lockidtbl stuff as it uses a realloced array | |
2992 | + * to hold the locks in. I think this is slightly better | |
2993 | + * in some ways. | |
2994 | + * | |
2995 | + * Any better suggestions gratefully received. Patrick | |
2996 | + * | |
2997 | + */ | |
2998 | + | |
2999 | +#include "dlm_internal.h" | |
3000 | +#include "lockqueue.h" | |
3001 | +#include "lkb.h" | |
3002 | +#include "config.h" | |
3003 | +#include "rsb.h" | |
3004 | +#include "memory.h" | |
3005 | +#include "lockspace.h" | |
3006 | +#include "util.h" | |
3007 | + | |
3008 | +/* | |
3009 | + * Internal find lock by ID. Must be called with the lockidtbl spinlock held. | |
3010 | + */ | |
3011 | + | |
3012 | +static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid) | |
3013 | +{ | |
3014 | + uint16_t entry = lkid & 0xFFFF; | |
3015 | + gd_lkb_t *lkb; | |
3016 | + | |
3017 | + if (entry >= ls->ls_lockidtbl_size) | |
3018 | + goto out; | |
3019 | + | |
3020 | + list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){ | |
3021 | + if (lkb->lkb_id == lkid) | |
3022 | + return lkb; | |
3023 | + } | |
3024 | + | |
3025 | + out: | |
3026 | + return NULL; | |
3027 | +} | |
3028 | + | |
3029 | +/* | |
3030 | + * Should be called at lockspace initialisation time. | |
3031 | + */ | |
3032 | + | |
3033 | +int init_lockidtbl(gd_ls_t *ls, int entries) | |
3034 | +{ | |
3035 | + int i; | |
3036 | + | |
3037 | + /* Make sure it's a power of two */ | |
3038 | + GDLM_ASSERT(!(entries & (entries - 1)),); | |
3039 | + | |
3040 | + ls->ls_lockidtbl_size = entries; | |
3041 | + rwlock_init(&ls->ls_lockidtbl_lock); | |
3042 | + | |
3043 | + ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry), | |
3044 | + GFP_KERNEL); | |
3045 | + if (!ls->ls_lockidtbl) | |
3046 | + return -ENOMEM; | |
3047 | + | |
3048 | + for (i = 0; i < entries; i++) { | |
3049 | + INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list); | |
3050 | + ls->ls_lockidtbl[i].counter = 1; | |
3051 | + } | |
3052 | + | |
3053 | + return 0; | |
3054 | +} | |
3055 | + | |
3056 | +/* | |
3057 | + * Free up the space - returns an error if there are still locks hanging around | |
3058 | + */ | |
3059 | + | |
3060 | +int free_lockidtbl(gd_ls_t *ls) | |
3061 | +{ | |
3062 | + int i; | |
3063 | + | |
3064 | + write_lock(&ls->ls_lockidtbl_lock); | |
3065 | + | |
3066 | + for (i = 0; i < ls->ls_lockidtbl_size; i++) { | |
3067 | + if (!list_empty(&ls->ls_lockidtbl[i].list)) { | |
3068 | + write_unlock(&ls->ls_lockidtbl_lock); | |
3069 | + return -1; | |
3070 | + } | |
3071 | + } | |
3072 | + kfree(ls->ls_lockidtbl); | |
3073 | + | |
3074 | + write_unlock(&ls->ls_lockidtbl_lock); | |
3075 | + | |
3076 | + return 0; | |
3077 | +} | |
3078 | + | |
3079 | +/* | |
3080 | + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a | |
3081 | + * random number between 0 and lockidtbl_size-1. This random number specifies | |
3082 | + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially | |
3083 | + * assigned per-bucket id. | |
3084 | + * | |
3085 | + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked | |
3086 | + * against the lkid of all lkb's in the bucket to avoid duplication. | |
3087 | + * | |
3088 | + */ | |
3089 | + | |
3090 | +gd_lkb_t *create_lkb(gd_ls_t *ls) | |
3091 | +{ | |
3092 | + gd_lkb_t *lkb; | |
3093 | + uint32_t lkid; | |
3094 | + uint16_t bucket; | |
3095 | + | |
3096 | + lkb = allocate_lkb(ls); | |
3097 | + if (!lkb) | |
3098 | + goto out; | |
3099 | + | |
3100 | + write_lock(&ls->ls_lockidtbl_lock); | |
3101 | + do { | |
3102 | + get_random_bytes(&bucket, sizeof(bucket)); | |
3103 | + bucket &= (ls->ls_lockidtbl_size - 1); | |
3104 | + lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16); | |
3105 | + } | |
3106 | + while (__find_lock_by_id(ls, lkid)); | |
3107 | + | |
3108 | + lkb->lkb_id = (uint32_t) lkid; | |
3109 | + list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list); | |
3110 | + write_unlock(&ls->ls_lockidtbl_lock); | |
3111 | + | |
3112 | + out: | |
3113 | + return lkb; | |
3114 | +} | |
3115 | + | |
3116 | +/* | |
3117 | + * Free LKB and remove it from the lockidtbl. | |
3118 | + * NB - this always frees the lkb whereas release_rsb doesn't free an | |
3119 | + * rsb unless its reference count is zero. | |
3120 | + */ | |
3121 | + | |
3122 | +void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb) | |
3123 | +{ | |
3124 | + if (lkb->lkb_status) { | |
3125 | + log_error(ls, "release lkb with status %u", lkb->lkb_status); | |
3126 | + print_lkb(lkb); | |
3127 | + return; | |
3128 | + } | |
3129 | + | |
3130 | + if (lkb->lkb_parent) | |
3131 | + atomic_dec(&lkb->lkb_parent->lkb_childcnt); | |
3132 | + | |
3133 | + write_lock(&ls->ls_lockidtbl_lock); | |
3134 | + list_del(&lkb->lkb_idtbl_list); | |
3135 | + write_unlock(&ls->ls_lockidtbl_lock); | |
3136 | + | |
3137 | + /* if this is not a master copy then lvbptr points into the user's | |
3138 | + * lksb, so don't free it */ | |
3139 | + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY) | |
3140 | + free_lvb(lkb->lkb_lvbptr); | |
3141 | + | |
3142 | + if (lkb->lkb_range) | |
3143 | + free_range(lkb->lkb_range); | |
3144 | + | |
3145 | + free_lkb(lkb); | |
3146 | +} | |
3147 | + | |
3148 | +gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid) | |
3149 | +{ | |
3150 | + gd_lkb_t *lkb; | |
3151 | + | |
3152 | + read_lock(&ls->ls_lockidtbl_lock); | |
3153 | + lkb = __find_lock_by_id(ls, lkid); | |
3154 | + read_unlock(&ls->ls_lockidtbl_lock); | |
3155 | + | |
3156 | + return lkb; | |
3157 | +} | |
3158 | + | |
3159 | +gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid) | |
3160 | +{ | |
3161 | + gd_ls_t *lspace = find_lockspace_by_local_id(ls); | |
3162 | + return find_lock_by_id(lspace, lkid); | |
3163 | +} | |
3164 | + | |
3165 | +/* | |
3166 | + * Initialise the range parts of an LKB. | |
3167 | + */ | |
3168 | + | |
3169 | +int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end) | |
3170 | +{ | |
3171 | + int ret = -ENOMEM; | |
3172 | + | |
3173 | + /* | |
3174 | + * if this wasn't already a range lock, make it one | |
3175 | + */ | |
3176 | + if (!lkb->lkb_range) { | |
3177 | + lkb->lkb_range = allocate_range(lspace); | |
3178 | + if (!lkb->lkb_range) | |
3179 | + goto out; | |
3180 | + | |
3181 | + /* | |
3182 | + * This is needed for conversions that contain ranges where the | |
3183 | + * original lock didn't but it's harmless for new locks too. | |
3184 | + */ | |
3185 | + lkb->lkb_range[GR_RANGE_START] = 0LL; | |
3186 | + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL; | |
3187 | + } | |
3188 | + | |
3189 | + lkb->lkb_range[RQ_RANGE_START] = start; | |
3190 | + lkb->lkb_range[RQ_RANGE_END] = end; | |
3191 | + | |
3192 | + ret = 0; | |
3193 | + | |
3194 | + out: | |
3195 | + return ret; | |
3196 | +} | |
3197 | diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h | |
3198 | --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 3199 | +++ linux-patched/cluster/dlm/lkb.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 3200 | @@ -0,0 +1,27 @@ |
3201 | +/****************************************************************************** | |
3202 | +******************************************************************************* | |
3203 | +** | |
3204 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
3205 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
3206 | +** | |
3207 | +** This copyrighted material is made available to anyone wishing to use, | |
3208 | +** modify, copy, or redistribute it subject to the terms and conditions | |
3209 | +** of the GNU General Public License v.2. | |
3210 | +** | |
3211 | +******************************************************************************* | |
3212 | +******************************************************************************/ | |
3213 | + | |
3214 | +#ifndef __LKB_DOT_H__ | |
3215 | +#define __LKB_DOT_H__ | |
3216 | + | |
3217 | +int free_lockidtbl(gd_ls_t * lspace); | |
3218 | +int init_lockidtbl(gd_ls_t * lspace, int entries); | |
3219 | + | |
3220 | +gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid); | |
3221 | +gd_lkb_t *create_lkb(gd_ls_t *ls); | |
3222 | +void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb); | |
3223 | +gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid); | |
3224 | +int verify_lkb_nodeids(gd_ls_t *ls); | |
3225 | +int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end); | |
3226 | + | |
3227 | +#endif /* __LKB_DOT_H__ */ | |
3228 | diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c | |
3229 | --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
3230 | +++ linux-patched/cluster/dlm/locking.c 2004-06-29 20:01:20.000000000 +0800 |
3231 | @@ -0,0 +1,1223 @@ | |
4bf12011 | 3232 | +/****************************************************************************** |
3233 | +******************************************************************************* | |
3234 | +** | |
3235 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
3236 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
3237 | +** | |
3238 | +** This copyrighted material is made available to anyone wishing to use, | |
3239 | +** modify, copy, or redistribute it subject to the terms and conditions | |
3240 | +** of the GNU General Public License v.2. | |
3241 | +** | |
3242 | +******************************************************************************* | |
3243 | +******************************************************************************/ | |
3244 | + | |
3245 | +/* | |
3246 | + * locking.c | |
3247 | + * | |
3248 | + * This is where the main work of the DLM goes on | |
3249 | + * | |
3250 | + */ | |
3251 | + | |
3252 | +#include "dlm_internal.h" | |
3253 | +#include "lockqueue.h" | |
3254 | +#include "locking.h" | |
3255 | +#include "lockspace.h" | |
3256 | +#include "lkb.h" | |
3257 | +#include "nodes.h" | |
3258 | +#include "dir.h" | |
3259 | +#include "ast.h" | |
3260 | +#include "memory.h" | |
3261 | +#include "rsb.h" | |
3262 | + | |
3263 | +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) | |
3264 | + | |
3265 | +/* | |
3266 | + * Lock compatibility matrix - thanks Steve | |
3267 | + * UN = Unlocked state. Not really a state, used as a flag | |
3268 | + * PD = Padding. Used to make the matrix a nice power of two in size | |
3269 | + * Other states are the same as the VMS DLM. | |
3270 | + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) | |
3271 | + */ | |
3272 | + | |
3273 | +#define modes_compat(gr, rq) \ | |
3274 | + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] | |
3275 | + | |
3276 | +const int __dlm_compat_matrix[8][8] = { | |
3277 | + /* UN NL CR CW PR PW EX PD */ | |
3278 | + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ | |
3279 | + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ | |
3280 | + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ | |
3281 | + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ | |
3282 | + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ | |
3283 | + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ | |
3284 | + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ | |
3285 | + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ | |
3286 | +}; | |
3287 | + | |
3288 | +/* | |
3289 | + * Compatibility matrix for conversions with QUECVT set. | |
3290 | + * Granted mode is the row; requested mode is the column. | |
3291 | + * Usage: matrix[grmode+1][rqmode+1] | |
3292 | + */ | |
3293 | + | |
3294 | +const int __quecvt_compat_matrix[8][8] = { | |
3295 | + /* UN NL CR CW PR PW EX PD */ | |
3296 | + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ | |
3297 | + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ | |
3298 | + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ | |
3299 | + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ | |
3300 | + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ | |
3301 | + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ | |
3302 | + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ | |
3303 | + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ | |
3304 | +}; | |
3305 | + | |
3306 | +/* | |
3307 | + * This defines the direction of transfer of LVB data. | |
3308 | + * Granted mode is the row; requested mode is the column. | |
3309 | + * Usage: matrix[grmode+1][rqmode+1] | |
3310 | + * 1 = LVB is returned to the caller | |
3311 | + * 0 = LVB is written to the resource | |
3312 | + * -1 = nothing happens to the LVB | |
3313 | + */ | |
3314 | + | |
3315 | +const int __lvb_operations[8][8] = { | |
3316 | + /* UN NL CR CW PR PW EX PD*/ | |
3317 | + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ | |
3318 | + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ | |
3319 | + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ | |
3320 | + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ | |
3321 | + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ | |
3322 | + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ | |
3323 | + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ | |
3324 | + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ | |
3325 | +}; | |
3326 | + | |
3327 | +static void grant_lock(gd_lkb_t * lkb, int send_remote); | |
3328 | +static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb); | |
3329 | +static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb); | |
3330 | +static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb, | |
3331 | + int flags, void *ast, void *astarg, void *bast, | |
3332 | + struct dlm_range *range); | |
3333 | +static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags, | |
3334 | + char *name, int namelen); | |
3335 | + | |
3336 | + | |
3337 | +static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head) | |
3338 | +{ | |
3339 | + gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue); | |
3340 | + | |
3341 | + if (lkb->lkb_id == first->lkb_id) | |
3342 | + return 1; | |
3343 | + | |
3344 | + return 0; | |
3345 | +} | |
3346 | + | |
3347 | +/* | |
3348 | + * Return 1 if the locks' ranges overlap | |
3349 | + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff | |
3350 | + */ | |
3351 | + | |
3352 | +static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2) | |
3353 | +{ | |
3354 | + if (!lkb1->lkb_range || !lkb2->lkb_range) | |
3355 | + return 1; | |
3356 | + | |
3357 | + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] || | |
3358 | + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END]) | |
3359 | + return 0; | |
3360 | + | |
3361 | + return 1; | |
3362 | +} | |
3363 | + | |
3364 | +/* | |
3365 | + * Resolve conversion deadlock by changing to NL the granted mode of deadlocked | |
3366 | + * locks on the convert queue. One of the deadlocked locks is allowed to | |
3367 | + * retain its original granted state (we choose the lkb provided although it | |
3368 | + * shouldn't matter which.) We do not change the granted mode on locks without | |
3369 | + * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses | |
3370 | + * the flag consistently) the false return value is used. | |
3371 | + */ | |
3372 | + | |
static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
{
	gd_lkb_t *this;
	int rv = TRUE;

	list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
		/* The caller's lkb keeps its original granted mode. */
		if (this == lkb)
			continue;

		/* Non-overlapping range locks cannot deadlock each other. */
		if (!ranges_overlap(lkb, this))
			continue;

		/* Mutual incompatibility (each granted mode blocks the
		   other's requested mode) is the deadlock condition. */
		if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {

			/* Locks without CONVDEADLK may not be demoted;
			   remember the failure but keep scanning so every
			   demotable lock is still demoted. */
			if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
				rv = FALSE;
				continue;
			}
			/* Demote the granted mode to NL to break the cycle
			   and mark the lock so the demotion is reported. */
			this->lkb_grmode = DLM_LOCK_NL;
			this->lkb_flags |= GDLM_LKFLG_DEMOTED;
		}
	}
	return rv;
}
3397 | + | |
3398 | +/* | |
3399 | + * "A conversion deadlock arises with a pair of lock requests in the converting | |
3400 | + * queue for one resource. The granted mode of each lock blocks the requested | |
3401 | + * mode of the other lock." | |
3402 | + */ | |
3403 | + | |
3404 | +static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb) | |
3405 | +{ | |
3406 | + gd_lkb_t *this; | |
3407 | + | |
3408 | + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { | |
3409 | + if (this == lkb) | |
3410 | + continue; | |
3411 | + | |
3412 | + if (!ranges_overlap(lkb, this)) | |
3413 | + continue; | |
3414 | + | |
3415 | + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) | |
3416 | + return TRUE; | |
3417 | + } | |
3418 | + return FALSE; | |
3419 | +} | |
3420 | + | |
3421 | +/* | |
3422 | + * Check if the given lkb conflicts with another lkb on the queue. | |
3423 | + */ | |
3424 | + | |
3425 | +static int queue_conflict(struct list_head *head, gd_lkb_t *lkb) | |
3426 | +{ | |
3427 | + gd_lkb_t *this; | |
3428 | + | |
3429 | + list_for_each_entry(this, head, lkb_statequeue) { | |
3430 | + if (this == lkb) | |
3431 | + continue; | |
3432 | + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb)) | |
3433 | + return TRUE; | |
3434 | + } | |
3435 | + return FALSE; | |
3436 | +} | |
3437 | + | |
/*
 * Deadlock can arise when using the QUECVT flag if the requested mode of
 * the first converting lock is incompatible with the granted mode of
 * another converting lock further down the queue.  To prevent this
 * deadlock, a requested QUECVT lock is granted immediately if adding it
 * to the end of the queue would prevent a lock ahead of it from being
 * granted.
 */
3445 | + | |
3446 | +static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb) | |
3447 | +{ | |
3448 | + gd_lkb_t *this; | |
3449 | + | |
3450 | + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { | |
3451 | + if (this == lkb) | |
3452 | + break; | |
3453 | + | |
3454 | + if (ranges_overlap(lkb, this) && !modes_compat(lkb, this)) | |
3455 | + return TRUE; | |
3456 | + } | |
3457 | + return FALSE; | |
3458 | +} | |
3459 | + | |
3460 | +/* | |
3461 | + * Return 1 if the lock can be granted, 0 otherwise. | |
3462 | + * Also detect and resolve conversion deadlocks. | |
3463 | + */ | |
3464 | + | |
static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
{
	/* NL requests are compatible with everything. */
	if (lkb->lkb_rqmode == DLM_LOCK_NL)
		return TRUE;

	/* A conversion to the mode already held is a no-op. */
	if (lkb->lkb_rqmode == lkb->lkb_grmode)
		return TRUE;

	/* Blocked by a granted lock. */
	if (queue_conflict(&rsb->res_grantqueue, lkb))
		return FALSE;

	if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
		/* No conflict anywhere: a plain request is grantable. */
		if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
			return TRUE;

		/* A QUECVT request normally waits behind other
		   conversions; grant it now only when it is alone or
		   first, or when queueing it would deadlock a lock
		   ahead of it (see queuecvt_deadlock_detect). */
		if (list_empty(&rsb->res_convertqueue) ||
		    first_in_list(lkb, &rsb->res_convertqueue) ||
		    queuecvt_deadlock_detect(rsb, lkb))
			return TRUE;
		else
			return FALSE;
	}

	/* there *is* a conflict between this lkb and a converting lock so
	   we return false unless conversion deadlock resolution is permitted
	   (only conversion requests will have the CONVDEADLK flag set) */

	if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
		return FALSE;

	if (!conversion_deadlock_detect(rsb, lkb))
		return FALSE;

	/* Demote the other deadlocked locks to NL; if every one of them
	   permitted demotion, this lkb becomes grantable. */
	if (conversion_deadlock_resolve(rsb, lkb))
		return TRUE;

	return FALSE;
}
3503 | + | |
3504 | +int dlm_lock(void *lockspace, | |
3505 | + uint32_t mode, | |
3506 | + struct dlm_lksb *lksb, | |
3507 | + uint32_t flags, | |
3508 | + void *name, | |
3509 | + unsigned int namelen, | |
3510 | + uint32_t parent, | |
3511 | + void (*ast) (void *astarg), | |
3512 | + void *astarg, | |
3513 | + void (*bast) (void *astarg, int mode), | |
3514 | + struct dlm_range *range) | |
3515 | +{ | |
3516 | + gd_ls_t *lspace; | |
3517 | + gd_lkb_t *lkb = NULL, *parent_lkb = NULL; | |
3518 | + int ret = -EINVAL; | |
3519 | + | |
3520 | + lspace = find_lockspace_by_local_id(lockspace); | |
3521 | + if (!lspace) | |
3522 | + goto out; | |
3523 | + | |
3524 | + if (mode < 0 || mode > DLM_LOCK_EX) | |
3525 | + goto out; | |
3526 | + | |
3527 | + if (namelen > DLM_RESNAME_MAXLEN) | |
3528 | + goto out; | |
3529 | + | |
3530 | + if (flags & DLM_LKF_CANCEL) | |
3531 | + goto out; | |
3532 | + | |
3533 | + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) | |
3534 | + goto out; | |
3535 | + | |
3536 | + if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT)) | |
3537 | + goto out; | |
3538 | + | |
3539 | + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) | |
3540 | + goto out; | |
3541 | + | |
3542 | + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) | |
3543 | + goto out; | |
3544 | + | |
3545 | + if (!ast || !lksb) | |
3546 | + goto out; | |
3547 | + | |
3548 | + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) | |
3549 | + goto out; | |
3550 | + | |
3551 | + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) | |
3552 | + goto out; | |
3553 | + | |
3554 | + /* | |
3555 | + * Take conversion path. | |
3556 | + */ | |
3557 | + | |
3558 | + if (flags & DLM_LKF_CONVERT) { | |
3559 | + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg, | |
3560 | + bast, range); | |
3561 | + goto out; | |
3562 | + } | |
3563 | + | |
3564 | + /* | |
3565 | + * Take new lock path. | |
3566 | + */ | |
3567 | + | |
3568 | + if (parent) { | |
3569 | + down_read(&lspace->ls_unlock_sem); | |
3570 | + | |
3571 | + parent_lkb = find_lock_by_id(lspace, parent); | |
3572 | + | |
3573 | + if (!parent_lkb || | |
3574 | + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED || | |
3575 | + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY || | |
3576 | + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) { | |
3577 | + up_read(&lspace->ls_unlock_sem); | |
3578 | + goto out; | |
3579 | + } | |
3580 | + | |
3581 | + atomic_inc(&parent_lkb->lkb_childcnt); | |
3582 | + up_read(&lspace->ls_unlock_sem); | |
3583 | + } | |
3584 | + | |
3585 | + down_read(&lspace->ls_in_recovery); | |
3586 | + | |
3587 | + ret = -ENOMEM; | |
3588 | + | |
3589 | + lkb = create_lkb(lspace); | |
3590 | + if (!lkb) | |
3591 | + goto fail_dec; | |
3592 | + lkb->lkb_astaddr = ast; | |
3593 | + lkb->lkb_astparam = (long) astarg; | |
3594 | + lkb->lkb_bastaddr = bast; | |
3595 | + lkb->lkb_rqmode = mode; | |
3596 | + lkb->lkb_grmode = DLM_LOCK_IV; | |
3597 | + lkb->lkb_lksb = lksb; | |
3598 | + lkb->lkb_parent = parent_lkb; | |
3599 | + lkb->lkb_lockqueue_flags = flags; | |
3600 | + lkb->lkb_lvbptr = lksb->sb_lvbptr; | |
3601 | + | |
3602 | + /* Copy the range if appropriate */ | |
3603 | + if (range) { | |
3604 | + if (range->ra_start > range->ra_end) { | |
3605 | + ret = -EINVAL; | |
3606 | + goto fail_free; | |
3607 | + } | |
3608 | + | |
3609 | + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end)) | |
3610 | + goto fail_free; | |
3611 | + } | |
3612 | + | |
3613 | + /* Convert relevant flags to internal numbers */ | |
3614 | + if (flags & DLM_LKF_VALBLK) | |
3615 | + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; | |
3616 | + if (flags & DLM_LKF_PERSISTENT) | |
3617 | + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT; | |
3618 | + if (flags & DLM_LKF_NODLCKWT) | |
3619 | + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT; | |
3620 | + | |
3621 | + lksb->sb_lkid = lkb->lkb_id; | |
3622 | + | |
3623 | + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen); | |
3624 | + if (ret) | |
3625 | + goto fail_free; | |
3626 | + | |
3627 | + up_read(&lspace->ls_in_recovery); | |
3628 | + | |
3629 | + wake_astd(); | |
3630 | + | |
3631 | + return 0; | |
3632 | + | |
3633 | + fail_free: | |
3634 | + release_lkb(lspace, lkb); | |
3635 | + goto fail_unlock; | |
3636 | + | |
3637 | + fail_dec: | |
3638 | + if (parent_lkb) | |
3639 | + atomic_dec(&parent_lkb->lkb_childcnt); | |
3640 | + | |
3641 | + fail_unlock: | |
3642 | + up_read(&lspace->ls_in_recovery); | |
3643 | + | |
3644 | + out: | |
3645 | + return ret; | |
3646 | +} | |
3647 | + | |
3648 | +int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name, | |
3649 | + int namelen) | |
3650 | +{ | |
3651 | + gd_res_t *rsb, *parent_rsb = NULL; | |
3652 | + gd_lkb_t *parent_lkb = lkb->lkb_parent; | |
3653 | + gd_resdata_t *rd; | |
3654 | + uint32_t nodeid; | |
3655 | + int error; | |
3656 | + | |
3657 | + if (parent_lkb) | |
3658 | + parent_rsb = parent_lkb->lkb_resource; | |
3659 | + | |
3660 | + error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb); | |
3661 | + if (error) | |
3662 | + goto out; | |
3663 | + | |
3664 | + lkb->lkb_resource = rsb; | |
3665 | + lkb->lkb_nodeid = rsb->res_nodeid; | |
3666 | + | |
3667 | + /* | |
3668 | + * Next stage, do we need to find the master or can | |
3669 | + * we get on with the real locking work ? | |
3670 | + */ | |
3671 | + | |
3672 | + if (rsb->res_nodeid == -1) { | |
3673 | + if (get_directory_nodeid(rsb) != our_nodeid()) { | |
3674 | + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); | |
3675 | + goto out; | |
3676 | + } | |
3677 | + | |
3678 | + error = get_resdata(ls, our_nodeid(), rsb->res_name, | |
3679 | + rsb->res_length, &rd, 0); | |
3680 | + if (error) | |
3681 | + goto out; | |
3682 | + | |
3683 | + nodeid = rd->rd_master_nodeid; | |
3684 | + if (nodeid == our_nodeid()) | |
3685 | + nodeid = 0; | |
3686 | + rsb->res_nodeid = nodeid; | |
3687 | + lkb->lkb_nodeid = nodeid; | |
3688 | + rsb->res_resdir_seq = rd->rd_sequence; | |
3689 | + } | |
3690 | + | |
3691 | + error = dlm_lock_stage2(ls, lkb, rsb, flags); | |
3692 | + | |
3693 | + out: | |
3694 | + if (error) | |
3695 | + release_rsb(rsb); | |
3696 | + | |
3697 | + return error; | |
3698 | +} | |
3699 | + | |
3700 | +/* | |
3701 | + * Locking routine called after we have an RSB, either a copy of a remote one | |
3702 | + * or a local one, or perhaps a shiny new one all of our very own | |
3703 | + */ | |
3704 | + | |
3705 | +int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags) | |
3706 | +{ | |
3707 | + int error = 0; | |
3708 | + | |
3709 | + if (rsb->res_nodeid) { | |
3710 | + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING); | |
3711 | + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT); | |
3712 | + } else { | |
3713 | + dlm_lock_stage3(lkb); | |
3714 | + } | |
3715 | + | |
3716 | + return error; | |
3717 | +} | |
3718 | + | |
3719 | +/* | |
3720 | + * Called on an RSB's master node to do stage2 locking for a remote lock | |
3721 | + * request. Returns a proper lkb with rsb ready for lock processing. | |
3722 | + * This is analagous to sections of dlm_lock() and dlm_lock_stage1(). | |
3723 | + */ | |
3724 | + | |
gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
			struct gd_remlockrequest *freq)
{
	gd_res_t *rsb = NULL, *parent_rsb = NULL;
	gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
	int error, namelen;

	/* Resolve and pin the local parent lock, if one was named. */
	if (freq->rr_remparid) {
		parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
		if (!parent_lkb)
			goto fail;

		atomic_inc(&parent_lkb->lkb_childcnt);
		parent_rsb = parent_lkb->lkb_resource;
	}

	/*
	 * A new MSTCPY lkb.  Initialize lkb fields including the real lkid and
	 * node actually holding the (non-MSTCPY) lkb.  AST addresses are just
	 * flags in the master copy.
	 */

	lkb = create_lkb(ls);
	if (!lkb)
		goto fail_dec;
	lkb->lkb_grmode = DLM_LOCK_IV;
	lkb->lkb_rqmode = freq->rr_rqmode;
	lkb->lkb_parent = parent_lkb;
	lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
	lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
	lkb->lkb_nodeid = remote_nodeid;
	lkb->lkb_remid = freq->rr_header.rh_lkid;
	lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
	lkb->lkb_lockqueue_flags = freq->rr_flags;

	/* Mirror the requester's value block into a local buffer. */
	if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
		lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
		allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
		if (!lkb->lkb_lvbptr)
			goto fail_free;
	}

	/* NOTE(review): lkb_lockqueue_flags holds DLM_LKF_* request flags
	   but is tested here against internal GDLM_LKFLG_RANGE - confirm
	   the two flag spaces agree on this bit. */
	if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
		error = lkb_set_range(ls, lkb, freq->rr_range_start,
				      freq->rr_range_end);
		if (error)
			goto fail_free;
	}

	/*
	 * Get the RSB which this lock is for.  Create a new RSB if this is a
	 * new lock on a new resource.  We must be the master of any new rsb.
	 */

	/* The resource name is the variable-length tail of the message. */
	namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;

	error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
				   &rsb);
	if (error)
		goto fail_free;

	lkb->lkb_resource = rsb;
	if (rsb->res_nodeid == -1)
		rsb->res_nodeid = 0;
	if (freq->rr_resdir_seq)
		rsb->res_resdir_seq = freq->rr_resdir_seq;

	return lkb;


      fail_free:
	/* release_lkb handles parent */
	release_lkb(ls, lkb);
	parent_lkb = NULL;

      fail_dec:
	if (parent_lkb)
		atomic_dec(&parent_lkb->lkb_childcnt);
      fail:
	return NULL;
}
3806 | + | |
3807 | +/* | |
3808 | + * The final bit of lock request processing on the master node. Here the lock | |
3809 | + * is granted and the completion ast is queued, or the lock is put on the | |
3810 | + * waitqueue and blocking asts are sent. | |
3811 | + */ | |
3812 | + | |
void dlm_lock_stage3(gd_lkb_t *lkb)
{
	gd_res_t *rsb = lkb->lkb_resource;

	/*
	 * This is a locally mastered lock on a resource that already exists,
	 * see if it can be granted or if it must wait.  When this function is
	 * called for a remote lock request (process_cluster_request,
	 * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
	 * requesting node at the end of process_cluster_request, not at the
	 * end of grant_lock.
	 */

	down_write(&rsb->res_lock);

	if (can_be_granted(rsb, lkb)) {
		grant_lock(lkb, 0);
		goto out;
	}

	/*
	 * This request is not a conversion, so the lkb didn't exist other than
	 * for this request and should be freed after EAGAIN is returned in the
	 * ast.
	 */

	if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
		lkb->lkb_retstatus = -EAGAIN;
		/* NOQUEUEBAST asks for blocking asts even though the
		   request itself will not wait. */
		if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
			send_blocking_asts_all(rsb, lkb);
		/* AST_DEL: the ast delivery routine frees the lkb. */
		queue_ast(lkb, AST_COMP | AST_DEL, 0);
		goto out;
	}

	/*
	 * The requested lkb must wait.  Because the rsb of the requested lkb
	 * is mastered here, send blocking asts for the lkb's blocking the
	 * request.
	 */

	lkb->lkb_retstatus = 0;
	lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);

	send_blocking_asts(rsb, lkb);

      out:
	up_write(&rsb->res_lock);
}
3861 | + | |
/*
 * Unlock (or, with DLM_LKF_CANCEL, cancel) the lock identified by lkid.
 * Arguments are validated here; the real work happens locally in
 * dlm_unlock_stage2() or is forwarded to the master via remote_stage().
 * The outcome is delivered through the lock's completion ast.
 * Returns 0, -EINVAL, -EBUSY or -ENOTEMPTY.
 */
int dlm_unlock(void *lockspace,
	       uint32_t lkid,
	       uint32_t flags,
	       struct dlm_lksb *lksb,
	       void *astarg)
{
	gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
	gd_lkb_t *lkb;
	gd_res_t *rsb;
	int ret = -EINVAL;

	if (!ls)
		goto out;

	lkb = find_lock_by_id(ls, lkid);
	if (!lkb)
		goto out;

	/* Can't dequeue a master copy (a remote node's mastered lock) */
	if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
		goto out;

	/* Already waiting for a remote lock operation */
	if (lkb->lkb_lockqueue_state) {
		ret = -EBUSY;
		goto out;
	}

	/* Can only cancel WAITING or CONVERTing locks.
	 * This is just a quick check - it is also checked in unlock_stage2()
	 * (which may be on the master) under the semaphore.
	 */
	if ((flags & DLM_LKF_CANCEL) &&
	    (lkb->lkb_status == GDLM_LKSTS_GRANTED))
		goto out;

	/* "Normal" unlocks must operate on a granted lock */
	if (!(flags & DLM_LKF_CANCEL) &&
	    (lkb->lkb_status != GDLM_LKSTS_GRANTED))
		goto out;

	down_write(&ls->ls_unlock_sem);

	/* Can't dequeue a lock with sublocks */
	if (atomic_read(&lkb->lkb_childcnt)) {
		up_write(&ls->ls_unlock_sem);
		ret = -ENOTEMPTY;
		goto out;
	}

	/* Mark it as deleted so we can't use it as a parent in dlm_lock() */
	if (!(flags & DLM_LKF_CANCEL))
		lkb->lkb_flags |= GDLM_LKFLG_DELETED;
	up_write(&ls->ls_unlock_sem);

	/* Save any new params */
	if (lksb)
		lkb->lkb_lksb = lksb;
	if (astarg)
		lkb->lkb_astparam = (long) astarg;

	lkb->lkb_lockqueue_flags = flags;

	rsb = lkb->lkb_resource;

	down_read(&ls->ls_in_recovery);

	/* Remote master: ship the request; local master: do it here. */
	if (rsb->res_nodeid)
		ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
	else
		ret = dlm_unlock_stage2(lkb, flags);

	up_read(&ls->ls_in_recovery);

	wake_astd();

      out:
	return ret;
}
3941 | + | |
/*
 * Perform the unlock or cancel on the master node: handle cancellation
 * of waiting/converting locks, save or invalidate the lvb, queue the
 * completion ast, and for master copies of remote locks free the
 * lkb/rsb immediately.  Always returns 0.
 */
int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
{
	gd_res_t *rsb = lkb->lkb_resource;
	int old_status;
	int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;

	down_write(&rsb->res_lock);

	/* Can only cancel WAITING or CONVERTing locks */
	if ((flags & DLM_LKF_CANCEL) &&
	    (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
		lkb->lkb_retstatus = -EINVAL;
		queue_ast(lkb, AST_COMP, 0);
		goto out;
	}

	old_status = lkb_dequeue(lkb);

	/*
	 * If was granted grant any converting or waiting locks.
	 */

	if (old_status == GDLM_LKSTS_GRANTED)
		grant_pending_locks(rsb);

	/*
	 * Cancelling a conversion
	 */

	if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
		/* VMS semantics say we should send blocking ASTs again here */
		send_blocking_asts(rsb, lkb);

		/* Remove from deadlock detection */
		if (lkb->lkb_duetime)
			remove_from_deadlockqueue(lkb);

		/* Stick it back on the granted queue */
		lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
		lkb->lkb_rqmode = lkb->lkb_grmode;

		/* Was it blocking any other locks?
		   NOTE(review): lkb was just moved to the granted queue,
		   so testing it against the head of the convert queue
		   looks suspect (and first_in_list dereferences the head
		   of a possibly empty list) - confirm intent. */
		if (first_in_list(lkb, &rsb->res_convertqueue))
			grant_pending_locks(rsb);

		lkb->lkb_retstatus = -DLM_ECANCEL;
		queue_ast(lkb, AST_COMP, 0);
		goto out;
	}

	/*
	 * The lvb can be saved or cleared on unlock.
	 */

	/* Only a PW/EX holder may have modified the value block. */
	if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
		if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
			memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
		if (flags & DLM_LKF_IVVALBLK)
			memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
	}

	lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
	queue_ast(lkb, AST_COMP | AST_DEL, 0);

	/*
	 * Only free the LKB if we are the master copy.  Otherwise the AST
	 * delivery routine will free it after delivery.  queue_ast for MSTCPY
	 * lkb just sends a message.
	 */

	if (remote) {
		up_write(&rsb->res_lock);
		release_lkb(rsb->res_ls, lkb);
		release_rsb(rsb);
		goto out2;
	}

      out:
	up_write(&rsb->res_lock);
      out2:
	wake_astd();
	return 0;
}
4025 | + | |
4026 | +/* | |
4027 | + * Lock conversion | |
4028 | + */ | |
4029 | + | |
4030 | +static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb, | |
4031 | + int flags, void *ast, void *astarg, void *bast, | |
4032 | + struct dlm_range *range) | |
4033 | +{ | |
4034 | + gd_lkb_t *lkb; | |
4035 | + gd_res_t *rsb; | |
4036 | + int ret = -EINVAL; | |
4037 | + | |
4038 | + lkb = find_lock_by_id(ls, lksb->sb_lkid); | |
4039 | + if (!lkb) { | |
4040 | + goto out; | |
4041 | + } | |
4042 | + | |
4043 | + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) { | |
4044 | + ret = -EBUSY; | |
4045 | + goto out; | |
4046 | + } | |
4047 | + | |
4048 | + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { | |
4049 | + goto out; | |
4050 | + } | |
4051 | + | |
4052 | + if ((flags & DLM_LKF_QUECVT) && | |
4053 | + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) { | |
4054 | + goto out; | |
4055 | + } | |
4056 | + | |
4057 | + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) { | |
4058 | + goto out; | |
4059 | + } | |
4060 | + | |
4061 | + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) { | |
4062 | + goto out; | |
4063 | + } | |
4064 | + | |
4065 | + /* Set up the ranges as appropriate */ | |
4066 | + if (range) { | |
4067 | + if (range->ra_start > range->ra_end) | |
4068 | + goto out; | |
4069 | + | |
4070 | + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) { | |
4071 | + ret = -ENOMEM; | |
4072 | + goto out; | |
4073 | + } | |
4074 | + } | |
4075 | + | |
4076 | + rsb = lkb->lkb_resource; | |
4077 | + down_read(&rsb->res_ls->ls_in_recovery); | |
4078 | + | |
4079 | + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK; | |
4080 | + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED; | |
4081 | + | |
4082 | + if (flags & DLM_LKF_NODLCKWT) | |
4083 | + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT; | |
4084 | + if (ast) | |
4085 | + lkb->lkb_astaddr = ast; | |
4086 | + if (astarg) | |
4087 | + lkb->lkb_astparam = (long) astarg; | |
4088 | + if (bast) | |
4089 | + lkb->lkb_bastaddr = bast; | |
4090 | + lkb->lkb_rqmode = mode; | |
4091 | + lkb->lkb_lockqueue_flags = flags; | |
4092 | + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0; | |
4093 | + lkb->lkb_lvbptr = lksb->sb_lvbptr; | |
4094 | + | |
4095 | + if (rsb->res_nodeid) { | |
4096 | + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); | |
4097 | + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT); | |
4098 | + } else { | |
4099 | + ret = dlm_convert_stage2(lkb, FALSE); | |
4100 | + } | |
4101 | + | |
4102 | + up_read(&rsb->res_ls->ls_in_recovery); | |
4103 | + | |
4104 | + wake_astd(); | |
4105 | + | |
4106 | + out: | |
4107 | + return ret; | |
4108 | +} | |
4109 | + | |
4110 | +/* | |
4111 | + * For local conversion requests on locally mastered locks this is called | |
4112 | + * directly from dlm_lock/convert_lock. This function is also called for | |
4113 | + * remote conversion requests of MSTCPY locks (from process_cluster_request). | |
4114 | + */ | |
4115 | + | |
int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
{
	gd_res_t *rsb = lkb->lkb_resource;
	int ret = 0;

	down_write(&rsb->res_lock);

	/* Granting a conversion may unblock locks queued behind it. */
	if (can_be_granted(rsb, lkb)) {
		grant_lock(lkb, 0);
		grant_pending_locks(rsb);
		goto out;
	}

	/*
	 * Remove lkb from granted queue.
	 */

	lkb_dequeue(lkb);

	/*
	 * The user won't wait so stick it back on the grant queue
	 */

	if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
		lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
		ret = lkb->lkb_retstatus = -EAGAIN;
		/* NOTE(review): convert_lock() passes do_ast=FALSE, in
		   which case -EAGAIN reaches the caller via the return
		   value rather than a queued ast - confirm intended. */
		if (do_ast)
			queue_ast(lkb, AST_COMP, 0);
		if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
			send_blocking_asts_all(rsb, lkb);
		goto out;
	}

	/*
	 * The lkb's status tells which queue it's on.  Put back on convert
	 * queue.  (QUECVT requests added at end of the queue, all others in
	 * order.)
	 */

	lkb->lkb_retstatus = 0;
	lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);

	/*
	 * If the request can't be granted
	 */

	send_blocking_asts(rsb, lkb);

	/* Add to the deadlock detection queue unless disabled. */
	if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
		add_to_deadlockqueue(lkb);

      out:
	up_write(&rsb->res_lock);
	return ret;
}
4171 | + | |
4172 | +/* | |
4173 | + * Remove lkb from any queue it's on, add it to the granted queue, and queue a | |
4174 | + * completion ast. rsb res_lock must be held in write when this is called. | |
4175 | + */ | |
4176 | + | |
static void grant_lock(gd_lkb_t *lkb, int send_remote)
{
	gd_res_t *rsb = lkb->lkb_resource;

	/* A converting lock leaves deadlock detection once granted. */
	if (lkb->lkb_duetime)
		remove_from_deadlockqueue(lkb);

	if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
		int b;
		GDLM_ASSERT(lkb->lkb_lvbptr,);

		/* NOTE(review): allocate_lvb() result is not checked;
		   the memcpy below would dereference NULL on allocation
		   failure - confirm allocate_lvb cannot fail or add a
		   check. */
		if (!rsb->res_lvbptr)
			rsb->res_lvbptr = allocate_lvb(rsb->res_ls);

		/* The mode table selects the copy direction: resource
		   lvb into the lock, or the lock's lvb into the resource. */
		b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
		if (b)
			memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
		else
			memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
	}

	/* The requested range becomes the granted range. */
	if (lkb->lkb_range) {
		lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
		lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
	}

	lkb->lkb_grmode = lkb->lkb_rqmode;
	lkb->lkb_rqmode = DLM_LOCK_IV;
	lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);

	/* Reset bast suppression and report success in the ast. */
	lkb->lkb_highbast = 0;
	lkb->lkb_retstatus = 0;
	queue_ast(lkb, AST_COMP, 0);

	/*
	 * A remote conversion request has been granted, either immediately
	 * upon being requested or after waiting a bit.  In the former case,
	 * reply_and_grant() is called.  In the later case send_remote is 1 and
	 * remote_grant() is called.
	 *
	 * The "send_remote" flag is set only for locks which are granted "out
	 * of band" - ie by another lock being converted or unlocked.
	 *
	 * The second case occurs when this lkb is granted right away as part
	 * of processing the initial request.  In that case, we send a single
	 * message in reply_and_grant which combines the request reply with the
	 * grant message.
	 */

	if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
		if (send_remote)
			remote_grant(lkb);
		else if (lkb->lkb_request)
			reply_and_grant(lkb);
	}

}
4234 | + | |
4235 | +static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb) | |
4236 | +{ | |
4237 | + gd_lkb_t *gr; | |
4238 | + | |
4239 | + list_for_each_entry(gr, head, lkb_statequeue) { | |
4240 | + if (gr->lkb_bastaddr && | |
4241 | + gr->lkb_highbast < lkb->lkb_rqmode && | |
4242 | + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) { | |
5cdbd17b | 4243 | + queue_ast(gr, AST_BAST, lkb->lkb_rqmode); |
4bf12011 | 4244 | + gr->lkb_highbast = lkb->lkb_rqmode; |
4245 | + } | |
4246 | + } | |
4247 | +} | |
4248 | + | |
4249 | +/* | |
4250 | + * Notify granted locks if they are blocking a newly forced-to-wait lock. | |
4251 | + */ | |
4252 | + | |
static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
{
	send_bast_queue(&rsb->res_grantqueue, lkb);
	/* Scanning the convert queue as well (as send_blocking_asts_all
	   does) might improve performance; left disabled pending
	   measurement: */
	/* send_bast_queue(&rsb->res_convertqueue, lkb); */
}
4259 | + | |
/* As send_blocking_asts(), but scans the convert queue too. */
static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
{
	send_bast_queue(&rsb->res_grantqueue, lkb);
	send_bast_queue(&rsb->res_convertqueue, lkb);
}
4265 | + | |
4266 | +/* | |
4267 | + * Called when a lock has been dequeued. Look for any locks to grant that are | |
4268 | + * waiting for conversion or waiting to be granted. | |
4269 | + * The rsb res_lock must be held in write when this function is called. | |
4270 | + */ | |
4271 | + | |
int grant_pending_locks(gd_res_t *rsb)
{
	gd_lkb_t *lkb;
	struct list_head *list;
	struct list_head *temp;
	/* Tracks the strongest requested mode that remained blocked. */
	int8_t high = DLM_LOCK_IV;

	/* Conversions first: they already hold a granted mode. */
	list_for_each_safe(list, temp, &rsb->res_convertqueue) {
		lkb = list_entry(list, gd_lkb_t, lkb_statequeue);

		if (can_be_granted(rsb, lkb))
			grant_lock(lkb, 1);
		else
			high = MAX(lkb->lkb_rqmode, high);
	}

	list_for_each_safe(list, temp, &rsb->res_waitqueue) {
		lkb = list_entry(list, gd_lkb_t, lkb_statequeue);

		if (can_be_granted(rsb, lkb))
			grant_lock(lkb, 1);
		else
			high = MAX(lkb->lkb_rqmode, high);
	}

	/*
	 * If there are locks left on the wait/convert queue then send blocking
	 * ASTs to granted locks that are blocking
	 *
	 * FIXME: This might generate some spurious blocking ASTs for range
	 * locks.
	 */

	if (high > DLM_LOCK_IV) {
		list_for_each_safe(list, temp, &rsb->res_grantqueue) {
			lkb = list_entry(list, gd_lkb_t, lkb_statequeue);

			if (lkb->lkb_bastaddr &&
			    (lkb->lkb_highbast < high) &&
			    !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {

				queue_ast(lkb, AST_BAST, high);
				lkb->lkb_highbast = high;
			}
		}
	}

	return 0;
}
4321 | + | |
4322 | +/* | |
4323 | + * Called to cancel a locking operation that failed due to some internal | |
4324 | + * reason. | |
4325 | + * | |
4326 | + * Waiting locks will be removed, converting locks will be reverted to their | |
4327 | + * granted status, unlocks will be left where they are. | |
4328 | + * | |
4329 | + * A completion AST will be delivered to the caller. | |
4330 | + */ | |
4331 | + | |
4332 | +int cancel_lockop(gd_lkb_t *lkb, int status) | |
4333 | +{ | |
4334 | + int state = lkb->lkb_lockqueue_state; | |
5cdbd17b | 4335 | + uint16_t astflags = AST_COMP; |
4bf12011 | 4336 | + |
4337 | + lkb->lkb_lockqueue_state = 0; | |
4338 | + | |
4339 | + switch (state) { | |
4340 | + case GDLM_LQSTATE_WAIT_RSB: | |
5cdbd17b | 4341 | + astflags |= AST_DEL; |
4bf12011 | 4342 | + break; |
4343 | + | |
4344 | + case GDLM_LQSTATE_WAIT_CONDGRANT: | |
4345 | + res_lkb_dequeue(lkb); | |
5cdbd17b | 4346 | + astflags |= AST_DEL; |
4bf12011 | 4347 | + break; |
4348 | + | |
4349 | + case GDLM_LQSTATE_WAIT_CONVERT: | |
4350 | + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED); | |
4351 | + | |
4352 | + /* Remove from deadlock detection */ | |
4353 | + if (lkb->lkb_duetime) { | |
4354 | + remove_from_deadlockqueue(lkb); | |
4355 | + } | |
4356 | + break; | |
4357 | + | |
4358 | + case GDLM_LQSTATE_WAIT_UNLOCK: | |
4359 | + /* We can leave this. I think.... */ | |
4360 | + break; | |
4361 | + } | |
4362 | + | |
4363 | + lkb->lkb_retstatus = status; | |
5cdbd17b | 4364 | + queue_ast(lkb, astflags, 0); |
4bf12011 | 4365 | + |
4366 | + return 0; | |
4367 | +} | |
4368 | + | |
4369 | +/* | |
4370 | + * Check for conversion deadlock. If a deadlock was found | |
4371 | + * return lkb to kill, else return NULL | |
4372 | + */ | |
4373 | + | |
4374 | +gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb) | |
4375 | +{ | |
4376 | + gd_res_t *rsb = lkb->lkb_resource; | |
4377 | + struct list_head *entry; | |
4378 | + | |
4379 | + GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,); | |
4380 | + | |
4381 | + /* Work our way up to the head of the queue looking for locks that | |
4382 | + * conflict with us */ | |
4383 | + | |
4384 | + down_read(&rsb->res_lock); | |
4385 | + | |
4386 | + entry = lkb->lkb_statequeue.prev; | |
4387 | + while (entry != &rsb->res_convertqueue) { | |
4388 | + gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue); | |
4389 | + | |
4390 | + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) { | |
4391 | + up_read(&rsb->res_lock); | |
4392 | + return lkb; | |
4393 | + } | |
4394 | + entry = entry->prev; | |
4395 | + } | |
4396 | + up_read(&rsb->res_lock); | |
4397 | + | |
4398 | + return 0; | |
4399 | +} | |
4400 | + | |
4401 | +/* | |
4402 | + * Conversion operation was cancelled by us (not the user). | |
4403 | + * ret contains the return code to pass onto the user | |
4404 | + */ | |
4405 | + | |
4406 | +void cancel_conversion(gd_lkb_t *lkb, int ret) | |
4407 | +{ | |
4408 | + gd_res_t *rsb = lkb->lkb_resource; | |
4409 | + | |
4410 | + /* Stick it back on the granted queue */ | |
4411 | + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); | |
4412 | + lkb->lkb_rqmode = lkb->lkb_grmode; | |
4413 | + | |
4414 | + remove_from_deadlockqueue(lkb); | |
4415 | + | |
4416 | + lkb->lkb_retstatus = ret; | |
5cdbd17b | 4417 | + queue_ast(lkb, AST_COMP, 0); |
4bf12011 | 4418 | + wake_astd(); |
4419 | +} | |
4420 | + | |
4421 | +/* | |
4422 | + * As new master of the rsb for this lkb, we need to handle these requests | |
4423 | + * removed from the lockqueue and originating from local processes: | |
4424 | + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT, | |
4425 | + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT. | |
4426 | + */ | |
4427 | + | |
4428 | +void process_remastered_lkb(gd_lkb_t *lkb, int state) | |
4429 | +{ | |
4430 | + switch (state) { | |
4431 | + case GDLM_LQSTATE_WAIT_RSB: | |
4432 | + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb, | |
4433 | + lkb->lkb_lockqueue_flags, | |
4434 | + lkb->lkb_resource->res_name, | |
4435 | + lkb->lkb_resource->res_length); | |
4436 | + break; | |
4437 | + | |
4438 | + case GDLM_LQSTATE_WAIT_CONDGRANT: | |
4439 | + res_lkb_dequeue(lkb); | |
4440 | + dlm_lock_stage3(lkb); | |
4441 | + break; | |
4442 | + | |
4443 | + case GDLM_LQSTATE_WAIT_UNLOCK: | |
4444 | + dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags); | |
4445 | + break; | |
4446 | + | |
4447 | + case GDLM_LQSTATE_WAIT_CONVERT: | |
4448 | + dlm_convert_stage2(lkb, TRUE); | |
4449 | + break; | |
4450 | + | |
4451 | + default: | |
4452 | + GDLM_ASSERT(0,); | |
4453 | + } | |
4454 | +} | |
4455 | diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h | |
4456 | --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 4457 | +++ linux-patched/cluster/dlm/locking.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 4458 | @@ -0,0 +1,33 @@ |
4459 | +/****************************************************************************** | |
4460 | +******************************************************************************* | |
4461 | +** | |
4462 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
4463 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
4464 | +** | |
4465 | +** This copyrighted material is made available to anyone wishing to use, | |
4466 | +** modify, copy, or redistribute it subject to the terms and conditions | |
4467 | +** of the GNU General Public License v.2. | |
4468 | +** | |
4469 | +******************************************************************************* | |
4470 | +******************************************************************************/ | |
4471 | + | |
4472 | +#ifndef __LOCKING_DOT_H__ | |
4473 | +#define __LOCKING_DOT_H__ | |
4474 | + | |
4475 | +void process_remastered_lkb(gd_lkb_t * lkb, int state); | |
4476 | +void dlm_lock_stage3(gd_lkb_t * lkb); | |
4477 | +int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast); | |
4478 | +int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags); | |
4479 | +int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb, | |
4480 | + int flags); | |
4481 | +gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen); | |
4482 | +int free_rsb_if_unused(gd_res_t * rsb); | |
4483 | +gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace, | |
4484 | + struct gd_remlockrequest *freq); | |
4485 | +int cancel_lockop(gd_lkb_t * lkb, int status); | |
4486 | +int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags); | |
4487 | +int grant_pending_locks(gd_res_t * rsb); | |
4488 | +void cancel_conversion(gd_lkb_t * lkb, int ret); | |
4489 | +gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb); | |
4490 | + | |
4491 | +#endif /* __LOCKING_DOT_H__ */ | |
4492 | diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c | |
4493 | --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
4494 | +++ linux-patched/cluster/dlm/lockqueue.c 2004-06-29 20:01:20.000000000 +0800 |
4495 | @@ -0,0 +1,957 @@ | |
4bf12011 | 4496 | +/****************************************************************************** |
4497 | +******************************************************************************* | |
4498 | +** | |
4499 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
4500 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
4501 | +** | |
4502 | +** This copyrighted material is made available to anyone wishing to use, | |
4503 | +** modify, copy, or redistribute it subject to the terms and conditions | |
4504 | +** of the GNU General Public License v.2. | |
4505 | +** | |
4506 | +******************************************************************************* | |
4507 | +******************************************************************************/ | |
4508 | + | |
4509 | +/* | |
4510 | + * lockqueue.c | |
4511 | + * | |
4512 | + * This controls the lock queue, which is where locks | |
4513 | + * come when they need to wait for a remote operation | |
4514 | + * to complete. | |
4515 | + * | |
4516 | + * This could also be thought of as the "high-level" comms | |
4517 | + * layer. | |
4518 | + * | |
4519 | + */ | |
4520 | + | |
4521 | +#include "dlm_internal.h" | |
4522 | +#include "lockqueue.h" | |
4523 | +#include "dir.h" | |
4524 | +#include "locking.h" | |
4525 | +#include "lkb.h" | |
4526 | +#include "lowcomms.h" | |
4527 | +#include "midcomms.h" | |
4528 | +#include "reccomms.h" | |
4529 | +#include "nodes.h" | |
4530 | +#include "lockspace.h" | |
4531 | +#include "ast.h" | |
4532 | +#include "memory.h" | |
4533 | +#include "rsb.h" | |
4534 | +#include "queries.h" | |
4535 | + | |
4536 | +static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply); | |
4537 | +static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req); | |
4538 | + | |
4539 | +/* | |
4540 | + * format of an entry on the request queue | |
4541 | + */ | |
4542 | +struct rq_entry { | |
4543 | + struct list_head rqe_list; | |
4544 | + uint32_t rqe_nodeid; | |
4545 | + char rqe_request[1]; | |
4546 | +}; | |
4547 | + | |
4548 | +/* | |
4549 | + * Add a new request (if appropriate) to the request queue and send the remote | |
4550 | + * request out. - runs in the context of the locking caller | |
4551 | + * | |
4552 | + * Recovery of a remote_stage request if the remote end fails while the lkb | |
4553 | + * is still on the lockqueue: | |
4554 | + * | |
4555 | + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in | |
4556 | + * lockqueue_lkb_mark() at the start of recovery. | |
4557 | + * | |
4558 | + * o Some lkb's will be rebuilt on new master rsb's during recovery. | |
4559 | + * (depends on the type of request, see below). | |
4560 | + * | |
4561 | + * o At the end of recovery, resend_cluster_requests() looks at these | |
4562 | + * LQRESEND lkb's and either: | |
4563 | + * | |
4564 | + * i) resends the request to the new master for the rsb where the | |
4565 | + * request is processed as usual. The lkb remains on the lockqueue until | |
4566 | + * the new master replies and we run process_lockqueue_reply(). | |
4567 | + * | |
4568 | + * ii) if we've become the rsb master, remove the lkb from the lockqueue | |
4569 | + * and processes the request locally via process_remastered_lkb(). | |
4570 | + * | |
4571 | + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue | |
4572 | + * and the request should be resent if dest node is failed. | |
4573 | + * | |
4574 | + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's | |
4575 | + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag | |
4576 | + * makes send_lkb_queue() skip it). Resend this request to the new master. | |
4577 | + * | |
4578 | + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will | |
4579 | + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue). | |
4580 | + * Resend this request to the new master. | |
4581 | + * | |
4582 | + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue. | |
4583 | + * It will be rebuilt on the new master rsb's granted queue. Resend this | |
4584 | + * request to the new master. | |
4585 | + */ | |
4586 | + | |
4587 | +int remote_stage(gd_lkb_t *lkb, int state) | |
4588 | +{ | |
4589 | + int error; | |
4590 | + | |
4591 | + lkb->lkb_lockqueue_state = state; | |
4592 | + add_to_lockqueue(lkb); | |
4593 | + | |
4594 | + error = send_cluster_request(lkb, state); | |
4595 | + if (error < 0) { | |
4596 | + log_print("remote_stage error sending request %d", error); | |
4597 | + | |
4598 | + /* Leave on lockqueue, it will be resent to correct node during | |
4599 | + * recovery. */ | |
4600 | + | |
4601 | + /* | |
4602 | + lkb->lkb_lockqueue_state = 0; | |
4603 | + remove_from_lockqueue(lkb); | |
4604 | + return -ENOTCONN; | |
4605 | + */ | |
4606 | + } | |
4607 | + return 0; | |
4608 | +} | |
4609 | + | |
4610 | +/* | |
4611 | + * Requests received while the lockspace is in recovery get added to the | |
4612 | + * request queue and processed when recovery is complete. | |
4613 | + */ | |
4614 | + | |
4615 | +void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length) | |
4616 | +{ | |
4617 | + struct rq_entry *entry; | |
4618 | + | |
4619 | + if (in_nodes_gone(ls, nodeid)) | |
4620 | + return; | |
4621 | + | |
4622 | + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); | |
4623 | + if (!entry) { | |
4624 | + // TODO something better | |
4625 | + printk("dlm: add_to_requestqueue: out of memory\n"); | |
4626 | + return; | |
4627 | + } | |
4628 | + | |
4629 | + log_debug(ls, "add_to_requestqueue %d", nodeid); | |
4630 | + entry->rqe_nodeid = nodeid; | |
4631 | + memcpy(entry->rqe_request, request, length); | |
4632 | + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue); | |
4633 | +} | |
4634 | + | |
4635 | +int process_requestqueue(gd_ls_t *ls) | |
4636 | +{ | |
4637 | + int error = 0, count = 0; | |
4638 | + struct rq_entry *entry, *safe; | |
4639 | + struct gd_req_header *req; | |
4640 | + | |
4641 | + log_all(ls, "process held requests"); | |
4642 | + | |
4643 | + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) { | |
4644 | + req = (struct gd_req_header *) entry->rqe_request; | |
4645 | + log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid); | |
4646 | + | |
4647 | + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { | |
4648 | + log_debug(ls, "process_requestqueue aborted"); | |
4649 | + error = -EINTR; | |
4650 | + break; | |
4651 | + } | |
4652 | + | |
4653 | + error = process_cluster_request(entry->rqe_nodeid, req, TRUE); | |
4654 | + if (error == -EINTR) { | |
4655 | + log_debug(ls, "process_requestqueue interrupted"); | |
4656 | + break; | |
4657 | + } | |
4658 | + | |
4659 | + list_del(&entry->rqe_list); | |
4660 | + kfree(entry); | |
4661 | + count++; | |
4662 | + error = 0; | |
4663 | + } | |
4664 | + | |
4665 | + log_all(ls, "processed %d requests", count); | |
4666 | + return error; | |
4667 | +} | |
4668 | + | |
4669 | +void wait_requestqueue(gd_ls_t *ls) | |
4670 | +{ | |
4671 | + while (!list_empty(&ls->ls_requestqueue) && | |
4672 | + test_bit(LSFL_LS_RUN, &ls->ls_flags)) | |
4673 | + schedule(); | |
4674 | +} | |
4675 | + | |
4676 | +/* | |
4677 | + * Resdir requests (lookup or remove) and replies from before recovery are | |
4678 | + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now | |
4679 | + * gone are also invalid. | |
4680 | + */ | |
4681 | + | |
4682 | +void purge_requestqueue(gd_ls_t *ls) | |
4683 | +{ | |
4684 | + int count = 0; | |
4685 | + struct rq_entry *entry, *safe; | |
4686 | + struct gd_req_header *req; | |
4687 | + struct gd_remlockrequest *freq; | |
4688 | + gd_lkb_t *lkb; | |
4689 | + | |
4690 | + log_all(ls, "purge requests"); | |
4691 | + | |
4692 | + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) { | |
4693 | + req = (struct gd_req_header *) entry->rqe_request; | |
4694 | + freq = (struct gd_remlockrequest *) req; | |
4695 | + | |
4696 | + if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA || | |
4697 | + req->rh_cmd == GDLM_REMCMD_LOOKUP || | |
4698 | + in_nodes_gone(ls, entry->rqe_nodeid)) { | |
4699 | + | |
4700 | + list_del(&entry->rqe_list); | |
4701 | + kfree(entry); | |
4702 | + count++; | |
4703 | + | |
4704 | + } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) { | |
4705 | + | |
4706 | + /* | |
4707 | + * Replies to resdir lookups are invalid and must be | |
4708 | + * purged. The lookup requests are marked in | |
4709 | + * lockqueue_lkb_mark and will be resent in | |
4710 | + * resend_cluster_requests. The only way to check if | |
4711 | + * this is a lookup reply is to look at the | |
4712 | + * lockqueue_state of the lkb. | |
4713 | + */ | |
4714 | + | |
4715 | + lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid); | |
4716 | + GDLM_ASSERT(lkb,); | |
4717 | + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) { | |
4718 | + list_del(&entry->rqe_list); | |
4719 | + kfree(entry); | |
4720 | + count++; | |
4721 | + } | |
4722 | + } | |
4723 | + } | |
4724 | + | |
4725 | + log_all(ls, "purged %d requests", count); | |
4726 | +} | |
4727 | + | |
4728 | +/* | |
4729 | + * Check if there's a reply for the given lkid in the requestqueue. | |
4730 | + */ | |
4731 | + | |
4732 | +int reply_in_requestqueue(gd_ls_t *ls, int lkid) | |
4733 | +{ | |
4734 | + int rv = FALSE; | |
4735 | + struct rq_entry *entry, *safe; | |
4736 | + struct gd_req_header *req; | |
4737 | + struct gd_remlockrequest *freq; | |
4738 | + | |
4739 | + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) { | |
4740 | + req = (struct gd_req_header *) entry->rqe_request; | |
4741 | + freq = (struct gd_remlockrequest *) req; | |
4742 | + | |
4743 | + if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY && | |
4744 | + freq->rr_header.rh_lkid == lkid) { | |
4745 | + rv = TRUE; | |
4746 | + break; | |
4747 | + } | |
4748 | + } | |
4749 | + | |
4750 | + return rv; | |
4751 | +} | |
4752 | + | |
4753 | +void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src) | |
4754 | +{ | |
4755 | + if (!*lvbptr) | |
4756 | + *lvbptr = allocate_lvb(ls); | |
4757 | + if (*lvbptr) | |
4758 | + memcpy(*lvbptr, src, DLM_LVB_LEN); | |
4759 | +} | |
4760 | + | |
4761 | +/* | |
4762 | + * Process a lockqueue LKB after it has had it's remote processing complete and | |
4763 | + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread on | |
4764 | + * the machine that requested the lock. | |
4765 | + */ | |
4766 | + | |
4767 | +static void process_lockqueue_reply(gd_lkb_t *lkb, | |
4768 | + struct gd_remlockreply *reply) | |
4769 | +{ | |
4770 | + int state = lkb->lkb_lockqueue_state; | |
4771 | + int oldstate; | |
4772 | + gd_res_t *rsb = lkb->lkb_resource; | |
4773 | + gd_ls_t *ls = rsb->res_ls; | |
4774 | + | |
4775 | + lkb->lkb_lockqueue_state = 0; | |
4776 | + if (state) | |
4777 | + remove_from_lockqueue(lkb); | |
4778 | + | |
4779 | + switch (state) { | |
4780 | + case GDLM_LQSTATE_WAIT_RSB: | |
4781 | + | |
4782 | + GDLM_ASSERT(reply->rl_status == 0,); | |
4783 | + | |
4784 | + if (reply->rl_nodeid == our_nodeid()) | |
4785 | + rsb->res_nodeid = 0; | |
4786 | + else | |
4787 | + rsb->res_nodeid = reply->rl_nodeid; | |
4788 | + | |
4789 | + rsb->res_resdir_seq = reply->rl_resdir_seq; | |
4790 | + lkb->lkb_nodeid = rsb->res_nodeid; | |
4791 | + | |
4792 | + dlm_lock_stage2(rsb->res_ls, lkb, rsb, | |
4793 | + lkb->lkb_lockqueue_flags); | |
4794 | + break; | |
4795 | + | |
4796 | + case GDLM_LQSTATE_WAIT_CONVERT: | |
4797 | + case GDLM_LQSTATE_WAIT_CONDGRANT: | |
4798 | + | |
4799 | + /* | |
4800 | + * After a remote lock/conversion/grant request we put the lock | |
4801 | + * on the right queue and send an AST if appropriate. Any lock | |
4802 | + * shuffling (eg newly granted locks because this one was | |
4803 | + * converted downwards) will be dealt with in seperate messages | |
4804 | + * (which may be in the same network message) | |
4805 | + */ | |
4806 | + | |
4807 | + if (!lkb->lkb_remid) | |
4808 | + lkb->lkb_remid = reply->rl_lkid; | |
4809 | + | |
4810 | + /* | |
4811 | + * The remote request failed (we assume because of NOQUEUE). | |
4812 | + * If this is a new request (non-conv) the lkb was created just | |
4813 | + * for it so the lkb should be freed. If this was a | |
4814 | + * conversion, the lkb already existed so we should put it back | |
4815 | + * on the grant queue. | |
4816 | + */ | |
4817 | + | |
4818 | + if (reply->rl_status != 0) { | |
4819 | + GDLM_ASSERT(reply->rl_status == -EAGAIN,); | |
4820 | + | |
4821 | + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) { | |
4822 | + res_lkb_dequeue(lkb); | |
5cdbd17b AM |
4823 | + lkb->lkb_retstatus = reply->rl_status; |
4824 | + queue_ast(lkb, AST_COMP | AST_DEL, 0); | |
4825 | + } else { | |
4bf12011 | 4826 | + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); |
5cdbd17b AM |
4827 | + lkb->lkb_retstatus = reply->rl_status; |
4828 | + queue_ast(lkb, AST_COMP, 0); | |
4829 | + } | |
4bf12011 | 4830 | + break; |
4831 | + } | |
4832 | + | |
4833 | + /* | |
4834 | + * The remote request was successful in granting the request or | |
4835 | + * queuing it to be granted later. Add the lkb to the | |
4836 | + * appropriate rsb queue. | |
4837 | + */ | |
4838 | + | |
4839 | + switch (reply->rl_lockstate) { | |
4840 | + case GDLM_LKSTS_GRANTED: | |
4841 | + | |
4842 | + /* Compact version of grant_lock(). */ | |
4843 | + | |
4844 | + down_write(&rsb->res_lock); | |
4845 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) | |
4846 | + memcpy(lkb->lkb_lvbptr, reply->rl_lvb, | |
4847 | + DLM_LVB_LEN); | |
4848 | + | |
4849 | + lkb->lkb_grmode = lkb->lkb_rqmode; | |
4850 | + lkb->lkb_rqmode = DLM_LOCK_IV; | |
4851 | + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); | |
4852 | + | |
4853 | + if (lkb->lkb_range) { | |
4854 | + lkb->lkb_range[GR_RANGE_START] = | |
4855 | + lkb->lkb_range[RQ_RANGE_START]; | |
4856 | + lkb->lkb_range[GR_RANGE_END] = | |
4857 | + lkb->lkb_range[RQ_RANGE_END]; | |
4858 | + } | |
4859 | + up_write(&rsb->res_lock); | |
4860 | + | |
4861 | + lkb->lkb_retstatus = 0; | |
5cdbd17b | 4862 | + queue_ast(lkb, AST_COMP, 0); |
4bf12011 | 4863 | + break; |
4864 | + | |
4865 | + case GDLM_LKSTS_WAITING: | |
4866 | + | |
4867 | + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) | |
4868 | + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING); | |
4869 | + else | |
4870 | + log_error(ls, "wait reply for granted %x %u", | |
4871 | + lkb->lkb_id, lkb->lkb_nodeid); | |
4872 | + break; | |
4873 | + | |
4874 | + case GDLM_LKSTS_CONVERT: | |
4875 | + | |
4876 | + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) | |
4877 | + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); | |
4878 | + else | |
4879 | + log_error(ls, "convert reply for granted %x %u", | |
4880 | + lkb->lkb_id, lkb->lkb_nodeid); | |
4881 | + break; | |
4882 | + | |
4883 | + default: | |
4884 | + log_error(ls, "process_lockqueue_reply state %d", | |
4885 | + reply->rl_lockstate); | |
4886 | + } | |
4887 | + | |
4888 | + break; | |
4889 | + | |
4890 | + case GDLM_LQSTATE_WAIT_UNLOCK: | |
4891 | + | |
4892 | + /* | |
4893 | + * Unlocks should never fail. Update local lock info. This | |
4894 | + * always sends completion AST with status in lksb | |
4895 | + */ | |
4896 | + | |
4897 | + GDLM_ASSERT(reply->rl_status == 0,); | |
4898 | + oldstate = res_lkb_dequeue(lkb); | |
4899 | + | |
4900 | + /* Differentiate between unlocks and conversion cancellations */ | |
4901 | + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL && | |
4902 | + oldstate == GDLM_LKSTS_CONVERT) { | |
4903 | + res_lkb_enqueue(lkb->lkb_resource, lkb, | |
4904 | + GDLM_LKSTS_GRANTED); | |
4905 | + lkb->lkb_retstatus = -DLM_ECANCEL; | |
5cdbd17b | 4906 | + queue_ast(lkb, AST_COMP, 0); |
4bf12011 | 4907 | + } else { |
4bf12011 | 4908 | + lkb->lkb_retstatus = -DLM_EUNLOCK; |
5cdbd17b | 4909 | + queue_ast(lkb, AST_COMP | AST_DEL, 0); |
4bf12011 | 4910 | + } |
4bf12011 | 4911 | + break; |
4912 | + | |
4913 | + default: | |
4914 | + log_error(ls, "process_lockqueue_reply id %x state %d", | |
4915 | + lkb->lkb_id, state); | |
4916 | + } | |
4917 | +} | |
4918 | + | |
4919 | +/* | |
4920 | + * Tell a remote node to grant a lock. This happens when we are the master | |
4921 | + * copy for a lock that is actually held on a remote node. The remote end is | |
4922 | + * also responsible for sending the completion AST. | |
4923 | + */ | |
4924 | + | |
4925 | +void remote_grant(gd_lkb_t *lkb) | |
4926 | +{ | |
4927 | + struct writequeue_entry *e; | |
4928 | + struct gd_remlockrequest *req; | |
4929 | + | |
4930 | + // TODO Error handling | |
4931 | + e = lowcomms_get_buffer(lkb->lkb_nodeid, | |
4932 | + sizeof(struct gd_remlockrequest), | |
4933 | + lkb->lkb_resource->res_ls->ls_allocation, | |
4934 | + (char **) &req); | |
4935 | + if (!e) | |
4936 | + return; | |
4937 | + | |
4938 | + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT; | |
4939 | + req->rr_header.rh_length = sizeof(struct gd_remlockrequest); | |
4940 | + req->rr_header.rh_flags = 0; | |
4941 | + req->rr_header.rh_lkid = lkb->lkb_id; | |
4942 | + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id; | |
4943 | + req->rr_remlkid = lkb->lkb_remid; | |
4944 | + req->rr_flags = 0; | |
4945 | + | |
4946 | + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) { | |
4947 | + /* This is a confusing non-standard use of rr_flags which is | |
4948 | + * usually used to pass lockqueue_flags. */ | |
4949 | + req->rr_flags |= GDLM_LKFLG_DEMOTED; | |
4950 | + } | |
4951 | + | |
4952 | + add_request_lvb(lkb, req); | |
4953 | + midcomms_send_buffer(&req->rr_header, e); | |
4954 | +} | |
4955 | + | |
4956 | +void reply_and_grant(gd_lkb_t *lkb) | |
4957 | +{ | |
4958 | + struct gd_remlockrequest *req = lkb->lkb_request; | |
4959 | + struct gd_remlockreply *reply; | |
4960 | + struct writequeue_entry *e; | |
4961 | + | |
4962 | + // TODO Error handling | |
4963 | + e = lowcomms_get_buffer(lkb->lkb_nodeid, | |
4964 | + sizeof(struct gd_remlockreply), | |
4965 | + lkb->lkb_resource->res_ls->ls_allocation, | |
4966 | + (char **) &reply); | |
4967 | + if (!e) | |
4968 | + return; | |
4969 | + | |
4970 | + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY; | |
4971 | + reply->rl_header.rh_flags = 0; | |
4972 | + reply->rl_header.rh_length = sizeof(struct gd_remlockreply); | |
4973 | + reply->rl_header.rh_lkid = req->rr_header.rh_lkid; | |
4974 | + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace; | |
4975 | + | |
4976 | + reply->rl_status = lkb->lkb_retstatus; | |
4977 | + reply->rl_lockstate = lkb->lkb_status; | |
4978 | + reply->rl_lkid = lkb->lkb_id; | |
4979 | + | |
4980 | + GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),); | |
4981 | + | |
4982 | + lkb->lkb_request = NULL; | |
4983 | + | |
4984 | + add_reply_lvb(lkb, reply); | |
4985 | + midcomms_send_buffer(&reply->rl_header, e); | |
4986 | +} | |
4987 | + | |
4988 | +/* | |
4989 | + * Request removal of a dead entry in the resource directory | |
4990 | + */ | |
4991 | + | |
4992 | +void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen, | |
4993 | + uint8_t sequence) | |
4994 | +{ | |
4995 | + struct writequeue_entry *e; | |
4996 | + struct gd_remlockrequest *req; | |
4997 | + | |
4998 | + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { | |
4999 | + gd_rcom_t *rc = allocate_rcom_buffer(ls); | |
5000 | + | |
5001 | + memcpy(rc->rc_buf, name, namelen); | |
5002 | + rc->rc_datalen = namelen; | |
5003 | + | |
5004 | + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0); | |
5005 | + | |
5006 | + free_rcom_buffer(rc); | |
5007 | + return; | |
5008 | + } | |
5009 | + // TODO Error handling | |
5010 | + e = lowcomms_get_buffer(nodeid, | |
5011 | + sizeof(struct gd_remlockrequest) + namelen - 1, | |
5012 | + ls->ls_allocation, (char **) &req); | |
5013 | + if (!e) | |
5014 | + return; | |
5015 | + | |
5016 | + memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1); | |
5017 | + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA; | |
5018 | + req->rr_header.rh_length = | |
5019 | + sizeof(struct gd_remlockrequest) + namelen - 1; | |
5020 | + req->rr_header.rh_flags = 0; | |
5021 | + req->rr_header.rh_lkid = 0; | |
5022 | + req->rr_header.rh_lockspace = ls->ls_global_id; | |
5023 | + req->rr_remlkid = 0; | |
5024 | + req->rr_resdir_seq = sequence; | |
5025 | + memcpy(req->rr_name, name, namelen); | |
5026 | + | |
5027 | + midcomms_send_buffer(&req->rr_header, e); | |
5028 | +} | |
5029 | + | |
5030 | +/* | |
5031 | + * Send remote cluster request to directory or master node before the request | |
5032 | + * is put on the lock queue. Runs in the context of the locking caller. | |
5033 | + */ | |
5034 | + | |
5035 | +int send_cluster_request(gd_lkb_t *lkb, int state) | |
5036 | +{ | |
5037 | + uint32_t target_nodeid; | |
5038 | + gd_res_t *rsb = lkb->lkb_resource; | |
5039 | + gd_ls_t *ls = rsb->res_ls; | |
5040 | + struct gd_remlockrequest *req; | |
5041 | + struct writequeue_entry *e; | |
5042 | + | |
5043 | + /* Need to know the target nodeid before we allocate a send buffer */ | |
5044 | + target_nodeid = lkb->lkb_nodeid; | |
5045 | + GDLM_ASSERT(target_nodeid != 0,); | |
5046 | + | |
5047 | + if (state == GDLM_LQSTATE_WAIT_RSB) | |
5048 | + target_nodeid = get_directory_nodeid(rsb); | |
5049 | + | |
5050 | + GDLM_ASSERT(target_nodeid,); | |
5051 | + | |
5052 | + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { | |
5053 | + /* this may happen when called by resend_cluster_request */ | |
5054 | + log_error(ls, "send_cluster_request to %u state %d recovery", | |
5055 | + target_nodeid, state); | |
5056 | + } | |
5057 | + | |
5058 | + e = lowcomms_get_buffer(target_nodeid, | |
5059 | + sizeof(struct gd_remlockrequest) + | |
5060 | + rsb->res_length - 1, ls->ls_allocation, | |
5061 | + (char **) &req); | |
5062 | + if (!e) | |
5063 | + return -ENOBUFS; | |
5064 | + memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1); | |
5065 | + | |
5066 | + /* Common stuff, some are just defaults */ | |
5067 | + | |
5068 | + if (lkb->lkb_bastaddr) | |
5cdbd17b | 5069 | + req->rr_asts = AST_BAST; |
4bf12011 | 5070 | + if (lkb->lkb_astaddr) |
5cdbd17b | 5071 | + req->rr_asts |= AST_COMP; |
4bf12011 | 5072 | + if (lkb->lkb_parent) |
5073 | + req->rr_remparid = lkb->lkb_parent->lkb_remid; | |
5074 | + | |
5075 | + req->rr_flags = lkb->lkb_lockqueue_flags; | |
5076 | + req->rr_rqmode = lkb->lkb_rqmode; | |
5077 | + req->rr_remlkid = lkb->lkb_remid; | |
5078 | + req->rr_header.rh_length = | |
5079 | + sizeof(struct gd_remlockrequest) + rsb->res_length - 1; | |
5080 | + req->rr_header.rh_flags = 0; | |
5081 | + req->rr_header.rh_lkid = lkb->lkb_id; | |
5082 | + req->rr_header.rh_lockspace = ls->ls_global_id; | |
5083 | + | |
5084 | + switch (state) { | |
5085 | + | |
5086 | + case GDLM_LQSTATE_WAIT_RSB: | |
5087 | + | |
5088 | + /* The lock must be a root lock */ | |
5089 | + GDLM_ASSERT(!lkb->lkb_parent,); | |
5090 | + | |
5091 | + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP; | |
5092 | + memcpy(req->rr_name, rsb->res_name, rsb->res_length); | |
5093 | + break; | |
5094 | + | |
5095 | + case GDLM_LQSTATE_WAIT_CONVERT: | |
5096 | + | |
5097 | + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST; | |
5098 | + if (lkb->lkb_range) { | |
5099 | + req->rr_flags |= GDLM_LKFLG_RANGE; | |
5100 | + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START]; | |
5101 | + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END]; | |
5102 | + } | |
5103 | + break; | |
5104 | + | |
5105 | + case GDLM_LQSTATE_WAIT_CONDGRANT: | |
5106 | + | |
5107 | + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST; | |
5108 | + req->rr_resdir_seq = rsb->res_resdir_seq; | |
5109 | + memcpy(req->rr_name, rsb->res_name, rsb->res_length); | |
5110 | + if (lkb->lkb_range) { | |
5111 | + req->rr_flags |= GDLM_LKFLG_RANGE; | |
5112 | + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START]; | |
5113 | + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END]; | |
5114 | + } | |
5115 | + break; | |
5116 | + | |
5117 | + case GDLM_LQSTATE_WAIT_UNLOCK: | |
5118 | + | |
5119 | + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST; | |
5120 | + break; | |
5121 | + | |
5122 | + default: | |
5123 | + GDLM_ASSERT(!"Unknown cluster request",); | |
5124 | + } | |
5125 | + | |
5126 | + add_request_lvb(lkb, req); | |
5127 | + midcomms_send_buffer(&req->rr_header, e); | |
5128 | + | |
5129 | + return 0; | |
5130 | +} | |
5131 | + | |
5132 | +/* | |
5133 | + * We got a request from another cluster node, process it and return an info | |
5134 | + * structure with the lock state/LVB etc as required. Executes in the DLM's | |
5135 | + * recvd thread. | |
5136 | + */ | |
5137 | + | |
5138 | +int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery) | |
5139 | +{ | |
5140 | + gd_ls_t *lspace; | |
5141 | + gd_lkb_t *lkb = NULL; | |
5142 | + gd_res_t *rsb; | |
5143 | + int send_reply = 0, status = 0, namelen; | |
5144 | + struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req; | |
5145 | + struct gd_remlockreply reply; | |
5146 | + | |
5147 | + lspace = find_lockspace_by_global_id(req->rh_lockspace); | |
5148 | + | |
5149 | + if (!lspace) { | |
5150 | + log_print("process_cluster_request invalid lockspace %x " | |
5151 | + "from %d req %u", req->rh_lockspace, nodeid, | |
5152 | + req->rh_cmd); | |
5153 | + status = -EINVAL; | |
5154 | + goto out; | |
5155 | + } | |
5156 | + | |
5157 | + /* wait for recoverd to drain requestqueue */ | |
5158 | + if (!recovery) | |
5159 | + wait_requestqueue(lspace); | |
5160 | + | |
5161 | + /* | |
5162 | + * If we're in recovery then queue the request for later. Otherwise, | |
5163 | + * we still need to get the "in_recovery" lock to make sure the | |
5164 | + * recovery itself doesn't start until we are done. | |
5165 | + */ | |
5166 | + retry: | |
5167 | + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) { | |
5168 | + if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags)) | |
5169 | + log_error(lspace, "process_cluster_request warning %u", | |
5170 | + nodeid); | |
5171 | + add_to_requestqueue(lspace, nodeid, (char *) req, | |
5172 | + req->rh_length); | |
5173 | + log_debug(lspace, "process_cluster_request abort"); | |
5174 | + status = -EINTR; | |
5175 | + goto out; | |
5176 | + } | |
5177 | + if (!down_read_trylock(&lspace->ls_in_recovery)) { | |
5178 | + schedule(); | |
5179 | + goto retry; | |
5180 | + } | |
5181 | + | |
5182 | + | |
5183 | + /* | |
5184 | + * Process the request. | |
5185 | + */ | |
5186 | + | |
5187 | + switch (req->rh_cmd) { | |
5188 | + | |
5189 | + case GDLM_REMCMD_LOOKUP: | |
5190 | + { | |
5191 | + gd_resdata_t *rd; | |
5192 | + int status; | |
5193 | + uint32_t dir_nodeid; | |
5194 | + | |
5195 | + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; | |
5196 | + | |
5197 | + dir_nodeid = name_to_directory_nodeid(lspace, | |
5198 | + freq->rr_name, | |
5199 | + namelen); | |
5200 | + if (dir_nodeid != our_nodeid()) | |
5201 | + log_debug(lspace, "ignoring directory lookup"); | |
5202 | + | |
5203 | + status = get_resdata(lspace, nodeid, freq->rr_name, | |
5204 | + namelen, &rd, 0); | |
5205 | + if (status) | |
5206 | + status = -ENOMEM; | |
5207 | + | |
5208 | + reply.rl_status = status; | |
5209 | + reply.rl_lockstate = 0; | |
5210 | + reply.rl_nodeid = rd->rd_master_nodeid; | |
5211 | + reply.rl_resdir_seq = rd->rd_sequence; | |
5212 | + } | |
5213 | + send_reply = 1; | |
5214 | + break; | |
5215 | + | |
5216 | + case GDLM_REMCMD_REM_RESDATA: | |
5217 | + | |
5218 | + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; | |
5219 | + remove_resdata(lspace, nodeid, freq->rr_name, namelen, | |
5220 | + freq->rr_resdir_seq); | |
5221 | + break; | |
5222 | + | |
5223 | + case GDLM_REMCMD_LOCKREQUEST: | |
5224 | + | |
5225 | + lkb = remote_stage2(nodeid, lspace, freq); | |
5226 | + if (lkb) { | |
5227 | + lkb->lkb_request = freq; | |
5228 | + dlm_lock_stage3(lkb); | |
5229 | + | |
5230 | + /* | |
5231 | + * If the request was granted in lock_stage3, then a | |
5232 | + * reply message was already sent in combination with | |
5233 | + * the grant message and lkb_request is NULL. | |
5234 | + */ | |
5235 | + | |
5236 | + if (lkb->lkb_request) { | |
5237 | + lkb->lkb_request = NULL; | |
5238 | + send_reply = 1; | |
5239 | + reply.rl_status = lkb->lkb_retstatus; | |
5240 | + reply.rl_lockstate = lkb->lkb_status; | |
5241 | + reply.rl_lkid = lkb->lkb_id; | |
5242 | + | |
5243 | + /* | |
5244 | + * If the request could not be granted and the | |
5245 | + * user won't wait, then free up the LKB | |
5246 | + */ | |
5247 | + | |
5cdbd17b AM |
5248 | + if (lkb->lkb_retstatus == -EAGAIN) { |
5249 | + GDLM_ASSERT(lkb->lkb_lockqueue_flags & | |
5250 | + DLM_LKF_NOQUEUE,); | |
4bf12011 | 5251 | + rsb = lkb->lkb_resource; |
5252 | + release_lkb(lspace, lkb); | |
5253 | + release_rsb(rsb); | |
5254 | + lkb = NULL; | |
5255 | + } | |
5256 | + } | |
5257 | + } else { | |
5258 | + reply.rl_status = -ENOMEM; | |
5259 | + send_reply = 1; | |
5260 | + } | |
5261 | + break; | |
5262 | + | |
5263 | + case GDLM_REMCMD_CONVREQUEST: | |
5264 | + | |
5265 | + lkb = find_lock_by_id(lspace, freq->rr_remlkid); | |
5266 | + | |
5267 | + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n", | |
5268 | + freq->rr_remlkid, | |
5269 | + freq->rr_header.rh_lkid, nodeid);); | |
5270 | + | |
5271 | + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) | |
5272 | + log_error(lspace, "convrequest: invalid status %d", | |
5273 | + lkb->lkb_status); | |
5274 | + | |
5275 | + lkb->lkb_rqmode = freq->rr_rqmode; | |
5276 | + lkb->lkb_lockqueue_flags = freq->rr_flags; | |
5277 | + lkb->lkb_request = freq; | |
5278 | + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED; | |
5279 | + | |
5280 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK | |
5281 | + || freq->rr_flags & DLM_LKF_VALBLK) { | |
5282 | + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; | |
5283 | + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, | |
5284 | + freq->rr_lvb); | |
5285 | + } | |
5286 | + | |
5287 | + if (freq->rr_flags & GDLM_LKFLG_RANGE) { | |
5288 | + if (lkb_set_range(lspace, lkb, freq->rr_range_start, | |
5289 | + freq->rr_range_end)) { | |
5290 | + reply.rl_status = -ENOMEM; | |
5291 | + send_reply = 1; | |
5292 | + goto out; | |
5293 | + } | |
5294 | + } | |
5295 | + | |
5296 | + dlm_convert_stage2(lkb, FALSE); | |
5297 | + | |
5298 | + /* | |
5299 | + * If the conv request was granted in stage2, then a reply | |
5300 | + * message was already sent in combination with the grant | |
5301 | + * message. | |
5302 | + */ | |
5303 | + | |
5304 | + if (lkb->lkb_request) { | |
5305 | + lkb->lkb_request = NULL; | |
5306 | + send_reply = 1; | |
5307 | + reply.rl_status = lkb->lkb_retstatus; | |
5308 | + reply.rl_lockstate = lkb->lkb_status; | |
5309 | + reply.rl_lkid = lkb->lkb_id; | |
5310 | + } | |
5311 | + break; | |
5312 | + | |
5313 | + case GDLM_REMCMD_LOCKREPLY: | |
5314 | + | |
5315 | + lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid); | |
5316 | + | |
5317 | + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n", | |
5318 | + freq->rr_remlkid, | |
5319 | + freq->rr_header.rh_lkid, nodeid);); | |
5320 | + | |
5321 | + process_lockqueue_reply(lkb, (struct gd_remlockreply *) req); | |
5322 | + break; | |
5323 | + | |
5324 | + case GDLM_REMCMD_LOCKGRANT: | |
5325 | + | |
5326 | + /* | |
5327 | + * Remote lock has been granted asynchronously. Do a compact | |
5328 | + * version of what grant_lock() does. | |
5329 | + */ | |
5330 | + | |
5331 | + lkb = find_lock_by_id(lspace, freq->rr_remlkid); | |
5332 | + | |
5333 | + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n", | |
5334 | + freq->rr_remlkid, | |
5335 | + freq->rr_header.rh_lkid, nodeid);); | |
5336 | + | |
5337 | + rsb = lkb->lkb_resource; | |
5338 | + | |
5339 | + if (lkb->lkb_lockqueue_state) | |
5340 | + log_error(rsb->res_ls, "granting lock on lockqueue " | |
5341 | + "id=%x from=%u lqstate=%d flags=%x", | |
5342 | + lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state, | |
5343 | + lkb->lkb_flags); | |
5344 | + | |
5345 | + down_write(&rsb->res_lock); | |
5346 | + | |
5347 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) | |
5348 | + memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN); | |
5349 | + | |
5350 | + lkb->lkb_grmode = lkb->lkb_rqmode; | |
5351 | + lkb->lkb_rqmode = DLM_LOCK_IV; | |
5352 | + | |
5353 | + if (lkb->lkb_range) { | |
5354 | + lkb->lkb_range[GR_RANGE_START] = | |
5355 | + lkb->lkb_range[RQ_RANGE_START]; | |
5356 | + lkb->lkb_range[GR_RANGE_END] = | |
5357 | + lkb->lkb_range[RQ_RANGE_END]; | |
5358 | + } | |
5359 | + | |
5360 | + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); | |
5361 | + up_write(&rsb->res_lock); | |
5362 | + | |
5363 | + if (freq->rr_flags & GDLM_LKFLG_DEMOTED) | |
5364 | + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED; | |
5365 | + | |
5366 | + lkb->lkb_retstatus = 0; | |
5cdbd17b | 5367 | + queue_ast(lkb, AST_COMP, 0); |
4bf12011 | 5368 | + break; |
5369 | + | |
5370 | + case GDLM_REMCMD_SENDBAST: | |
5371 | + | |
5372 | + lkb = find_lock_by_id(lspace, freq->rr_remlkid); | |
5373 | + | |
5374 | + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n", | |
5375 | + freq->rr_remlkid, | |
5376 | + freq->rr_header.rh_lkid, nodeid);); | |
5377 | + | |
5378 | + if (lkb->lkb_status == GDLM_LKSTS_GRANTED) | |
5cdbd17b | 5379 | + queue_ast(lkb, AST_BAST, freq->rr_rqmode); |
4bf12011 | 5380 | + break; |
5381 | + | |
5382 | + case GDLM_REMCMD_SENDCAST: | |
5383 | + | |
5384 | + /* This is only used for some error completion ASTs */ | |
5385 | + | |
5386 | + lkb = find_lock_by_id(lspace, freq->rr_remlkid); | |
5387 | + | |
5388 | + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n", | |
5389 | + freq->rr_remlkid, | |
5390 | + freq->rr_header.rh_lkid, nodeid);); | |
5391 | + | |
5392 | + /* Return the lock to granted status */ | |
5393 | + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED); | |
5394 | + | |
5395 | + lkb->lkb_retstatus = freq->rr_status; | |
5cdbd17b | 5396 | + queue_ast(lkb, AST_COMP, 0); |
4bf12011 | 5397 | + break; |
5398 | + | |
5399 | + case GDLM_REMCMD_UNLOCKREQUEST: | |
5400 | + | |
5401 | + lkb = find_lock_by_id(lspace, freq->rr_remlkid); | |
5402 | + | |
5403 | + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n", | |
5404 | + freq->rr_remlkid, | |
5405 | + freq->rr_header.rh_lkid, nodeid);); | |
5406 | + | |
5407 | + reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags); | |
5408 | + send_reply = 1; | |
5409 | + break; | |
5410 | + | |
5411 | + case GDLM_REMCMD_QUERY: | |
5412 | + remote_query(nodeid, lspace, req); | |
5413 | + break; | |
5414 | + | |
5415 | + case GDLM_REMCMD_QUERYREPLY: | |
5416 | + remote_query_reply(nodeid, lspace, req); | |
5417 | + break; | |
5418 | + | |
5419 | + default: | |
5420 | + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd); | |
5421 | + } | |
5422 | + | |
5423 | + up_read(&lspace->ls_in_recovery); | |
5424 | + | |
5425 | + out: | |
5426 | + if (send_reply) { | |
5427 | + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY; | |
5428 | + reply.rl_header.rh_flags = 0; | |
5429 | + reply.rl_header.rh_length = sizeof(reply); | |
5430 | + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid; | |
5431 | + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace; | |
5432 | + | |
5433 | + status = midcomms_send_message(nodeid, &reply.rl_header, | |
5434 | + GFP_KERNEL); | |
5435 | + } | |
5436 | + | |
5437 | + wake_astd(); | |
5438 | + | |
5439 | + return status; | |
5440 | +} | |
5441 | + | |
5442 | +static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply) | |
5443 | +{ | |
5444 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) | |
5445 | + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN); | |
5446 | +} | |
5447 | + | |
5448 | +static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req) | |
5449 | +{ | |
5450 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) | |
5451 | + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN); | |
5452 | +} | |
5453 | diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h | |
5454 | --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 5455 | +++ linux-patched/cluster/dlm/lockqueue.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 5456 | @@ -0,0 +1,29 @@ |
5457 | +/****************************************************************************** | |
5458 | +******************************************************************************* | |
5459 | +** | |
5460 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
5461 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
5462 | +** | |
5463 | +** This copyrighted material is made available to anyone wishing to use, | |
5464 | +** modify, copy, or redistribute it subject to the terms and conditions | |
5465 | +** of the GNU General Public License v.2. | |
5466 | +** | |
5467 | +******************************************************************************* | |
5468 | +******************************************************************************/ | |
5469 | + | |
5470 | +#ifndef __LOCKQUEUE_DOT_H__ | |
5471 | +#define __LOCKQUEUE_DOT_H__ | |
5472 | + | |
5473 | +void remote_grant(gd_lkb_t * lkb); | |
5474 | +void reply_and_grant(gd_lkb_t * lkb); | |
5475 | +int remote_stage(gd_lkb_t * lkb, int state); | |
5476 | +int process_cluster_request(int csid, struct gd_req_header *req, int recovery); | |
5477 | +int send_cluster_request(gd_lkb_t * lkb, int state); | |
5478 | +void purge_requestqueue(gd_ls_t * ls); | |
5479 | +int process_requestqueue(gd_ls_t * ls); | |
5480 | +int reply_in_requestqueue(gd_ls_t * ls, int lkid); | |
5481 | +void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen, | |
5482 | + uint8_t sequence); | |
5483 | +void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src); | |
5484 | + | |
5485 | +#endif /* __LOCKQUEUE_DOT_H__ */ | |
5486 | diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c | |
5487 | --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 5488 | +++ linux-patched/cluster/dlm/lockspace.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 5489 | @@ -0,0 +1,706 @@ |
5490 | +/****************************************************************************** | |
5491 | +******************************************************************************* | |
5492 | +** | |
5493 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
5494 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
5495 | +** | |
5496 | +** This copyrighted material is made available to anyone wishing to use, | |
5497 | +** modify, copy, or redistribute it subject to the terms and conditions | |
5498 | +** of the GNU General Public License v.2. | |
5499 | +** | |
5500 | +******************************************************************************* | |
5501 | +******************************************************************************/ | |
5502 | + | |
5503 | +#include <linux/module.h> | |
5504 | + | |
5505 | +#include "dlm_internal.h" | |
5506 | +#include "recoverd.h" | |
5507 | +#include "ast.h" | |
5508 | +#include "lkb.h" | |
5509 | +#include "nodes.h" | |
5510 | +#include "dir.h" | |
5511 | +#include "lowcomms.h" | |
5512 | +#include "config.h" | |
5513 | +#include "memory.h" | |
5514 | +#include "lockspace.h" | |
5515 | +#include "device.h" | |
5516 | + | |
5517 | +#define GDST_NONE (0) | |
5518 | +#define GDST_RUNNING (1) | |
5519 | + | |
5520 | +static int gdlmstate; | |
5521 | +static int gdlmcount; | |
5522 | +static struct semaphore gdlmstate_lock; | |
5523 | +struct list_head lslist; | |
5524 | +spinlock_t lslist_lock; | |
5525 | +struct kcl_service_ops ls_ops; | |
5526 | + | |
5527 | +static int new_lockspace(char *name, int namelen, void **lockspace, int flags); | |
5528 | + | |
5529 | + | |
5530 | +void dlm_lockspace_init(void) | |
5531 | +{ | |
5532 | + gdlmstate = GDST_NONE; | |
5533 | + gdlmcount = 0; | |
5534 | + init_MUTEX(&gdlmstate_lock); | |
5535 | + INIT_LIST_HEAD(&lslist); | |
5536 | + spin_lock_init(&lslist_lock); | |
5537 | +} | |
5538 | + | |
5539 | +gd_ls_t *find_lockspace_by_global_id(uint32_t id) | |
5540 | +{ | |
5541 | + gd_ls_t *ls; | |
5542 | + | |
5543 | + spin_lock(&lslist_lock); | |
5544 | + | |
5545 | + list_for_each_entry(ls, &lslist, ls_list) { | |
5546 | + if (ls->ls_global_id == id) | |
5547 | + goto out; | |
5548 | + } | |
5549 | + ls = NULL; | |
5550 | + out: | |
5551 | + spin_unlock(&lslist_lock); | |
5552 | + return ls; | |
5553 | +} | |
5554 | + | |
5555 | +/* TODO: make this more efficient */ | |
5556 | +gd_ls_t *find_lockspace_by_local_id(void *id) | |
5557 | +{ | |
5558 | + gd_ls_t *ls; | |
5559 | + | |
5560 | + spin_lock(&lslist_lock); | |
5561 | + | |
5562 | + list_for_each_entry(ls, &lslist, ls_list) { | |
5563 | + if (ls->ls_local_id == (uint32_t)(long)id) | |
5564 | + goto out; | |
5565 | + } | |
5566 | + ls = NULL; | |
5567 | + out: | |
5568 | + spin_unlock(&lslist_lock); | |
5569 | + return ls; | |
5570 | +} | |
5571 | + | |
5572 | +gd_ls_t *find_lockspace_by_name(char *name, int namelen) | |
5573 | +{ | |
5574 | + gd_ls_t *ls; | |
5575 | + | |
5576 | + spin_lock(&lslist_lock); | |
5577 | + | |
5578 | + list_for_each_entry(ls, &lslist, ls_list) { | |
5579 | + if (ls->ls_namelen == namelen && | |
5580 | + memcmp(ls->ls_name, name, namelen) == 0) | |
5581 | + goto out; | |
5582 | + } | |
5583 | + ls = NULL; | |
5584 | + out: | |
5585 | + spin_unlock(&lslist_lock); | |
5586 | + return ls; | |
5587 | +} | |
5588 | + | |
5589 | +/* | |
5590 | + * Called from dlm_init. These are the general threads which are not | |
5591 | + * lockspace-specific and work for all gdlm lockspaces. | |
5592 | + */ | |
5593 | + | |
5594 | +static int threads_start(void) | |
5595 | +{ | |
5596 | + int error; | |
5597 | + | |
5598 | + /* Thread which interacts with cman for all ls's */ | |
5599 | + error = recoverd_start(); | |
5600 | + if (error) { | |
5601 | + log_print("cannot start recovery thread %d", error); | |
5602 | + goto fail; | |
5603 | + } | |
5604 | + | |
5605 | + /* Thread which process lock requests for all ls's */ | |
5606 | + error = astd_start(); | |
5607 | + if (error) { | |
5608 | + log_print("cannot start ast thread %d", error); | |
5609 | + goto recoverd_fail; | |
5610 | + } | |
5611 | + | |
5612 | + /* Thread for sending/receiving messages for all ls's */ | |
5613 | + error = lowcomms_start(); | |
5614 | + if (error) { | |
5615 | + log_print("cannot start lowcomms %d", error); | |
5616 | + goto astd_fail; | |
5617 | + } | |
5618 | + | |
5619 | + return 0; | |
5620 | + | |
5621 | + astd_fail: | |
5622 | + astd_stop(); | |
5623 | + | |
5624 | + recoverd_fail: | |
5625 | + recoverd_stop(); | |
5626 | + | |
5627 | + fail: | |
5628 | + return error; | |
5629 | +} | |
5630 | + | |
5631 | +static void threads_stop(void) | |
5632 | +{ | |
5633 | + lowcomms_stop(); | |
5634 | + astd_stop(); | |
5635 | + recoverd_stop(); | |
5636 | +} | |
5637 | + | |
5638 | +static int init_internal(void) | |
5639 | +{ | |
5640 | + int error = 0; | |
5641 | + | |
5642 | + if (gdlmstate == GDST_RUNNING) | |
5643 | + gdlmcount++; | |
5644 | + else { | |
5645 | + error = threads_start(); | |
5646 | + if (error) | |
5647 | + goto out; | |
5648 | + | |
5649 | + gdlmstate = GDST_RUNNING; | |
5650 | + gdlmcount = 1; | |
5651 | + } | |
5652 | + | |
5653 | + out: | |
5654 | + return error; | |
5655 | +} | |
5656 | + | |
5657 | + | |
5658 | +/* | |
5659 | + * Called after gdlm module is loaded and before any lockspaces are created. | |
5660 | + * Starts and initializes global threads and structures. These global entities | |
5661 | + * are shared by and independent of all lockspaces. | |
5662 | + * | |
5663 | + * There should be a gdlm-specific user command which a person can run which | |
5664 | + * calls this function. If a user hasn't run that command and something | |
5665 | + * creates a new lockspace, this is called first. | |
5666 | + * | |
5667 | + * This also starts the default lockspace. | |
5668 | + */ | |
5669 | + | |
5670 | +int dlm_init(void) | |
5671 | +{ | |
5672 | + int error; | |
5673 | + | |
5674 | + down(&gdlmstate_lock); | |
5675 | + error = init_internal(); | |
5676 | + up(&gdlmstate_lock); | |
5677 | + | |
5678 | + return error; | |
5679 | +} | |
5680 | + | |
5681 | +int dlm_release(void) | |
5682 | +{ | |
5683 | + int error = 0; | |
5684 | + | |
5685 | + down(&gdlmstate_lock); | |
5686 | + | |
5687 | + if (gdlmstate == GDST_NONE) | |
5688 | + goto out; | |
5689 | + | |
5690 | + if (gdlmcount) | |
5691 | + gdlmcount--; | |
5692 | + | |
5693 | + if (gdlmcount) | |
5694 | + goto out; | |
5695 | + | |
5696 | + spin_lock(&lslist_lock); | |
5697 | + if (!list_empty(&lslist)) { | |
5698 | + spin_unlock(&lslist_lock); | |
5699 | + log_print("cannot stop threads, lockspaces still exist"); | |
5700 | + goto out; | |
5701 | + } | |
5702 | + spin_unlock(&lslist_lock); | |
5703 | + | |
5704 | + threads_stop(); | |
5705 | + gdlmstate = GDST_NONE; | |
5706 | + | |
5707 | + out: | |
5708 | + up(&gdlmstate_lock); | |
5709 | + | |
5710 | + return error; | |
5711 | +} | |
5712 | + | |
5713 | +gd_ls_t *allocate_ls(int namelen) | |
5714 | +{ | |
5715 | + gd_ls_t *ls; | |
5716 | + | |
5717 | + /* FIXME: use appropriate malloc type */ | |
5718 | + | |
5719 | + ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL); | |
5720 | + if (ls) | |
5721 | + memset(ls, 0, sizeof(gd_ls_t) + namelen); | |
5722 | + | |
5723 | + return ls; | |
5724 | +} | |
5725 | + | |
5726 | +void free_ls(gd_ls_t *ls) | |
5727 | +{ | |
5728 | + kfree(ls); | |
5729 | +} | |
5730 | + | |
5731 | +static int new_lockspace(char *name, int namelen, void **lockspace, int flags) | |
5732 | +{ | |
5733 | + gd_ls_t *ls; | |
5734 | + int i, error = -ENOMEM; | |
5735 | + uint32_t local_id = 0; | |
5736 | + | |
5737 | + if (!try_module_get(THIS_MODULE)) | |
5738 | + return -EINVAL; | |
5739 | + | |
5740 | + if (namelen > MAX_SERVICE_NAME_LEN) | |
5741 | + return -EINVAL; | |
5742 | + | |
5743 | + if ((ls = find_lockspace_by_name(name, namelen))) { | |
5744 | + *lockspace = (void *)ls->ls_local_id; | |
5745 | + return -EEXIST; | |
5746 | + } | |
5747 | + | |
5748 | + /* | |
5749 | + * Initialize ls fields | |
5750 | + */ | |
5751 | + | |
5752 | + ls = allocate_ls(namelen); | |
5753 | + if (!ls) | |
5754 | + goto out; | |
5755 | + | |
5756 | + memcpy(ls->ls_name, name, namelen); | |
5757 | + ls->ls_namelen = namelen; | |
5758 | + | |
5759 | + ls->ls_allocation = GFP_KERNEL; | |
5760 | + memset(&ls->ls_flags, 0, sizeof(unsigned long)); | |
5761 | + INIT_LIST_HEAD(&ls->ls_rootres); | |
5762 | + ls->ls_hashsize = dlm_config.reshashtbl; | |
5763 | + ls->ls_hashmask = ls->ls_hashsize - 1; | |
5764 | + | |
5765 | + ls->ls_reshashtbl = | |
5766 | + kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL); | |
5767 | + if (!ls->ls_reshashtbl) | |
5768 | + goto out_lsfree; | |
5769 | + | |
5770 | + for (i = 0; i < ls->ls_hashsize; i++) | |
5771 | + INIT_LIST_HEAD(&ls->ls_reshashtbl[i]); | |
5772 | + | |
5773 | + rwlock_init(&ls->ls_reshash_lock); | |
5774 | + | |
5775 | + if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1) | |
5776 | + goto out_htfree; | |
5777 | + | |
5778 | + INIT_LIST_HEAD(&ls->ls_nodes); | |
5779 | + ls->ls_num_nodes = 0; | |
5780 | + INIT_LIST_HEAD(&ls->ls_nodes_gone); | |
5781 | + INIT_LIST_HEAD(&ls->ls_recover); | |
5782 | + spin_lock_init(&ls->ls_recover_lock); | |
5783 | + INIT_LIST_HEAD(&ls->ls_recover_list); | |
5784 | + ls->ls_recover_list_count = 0; | |
5785 | + spin_lock_init(&ls->ls_recover_list_lock); | |
5786 | + init_waitqueue_head(&ls->ls_wait_general); | |
5787 | + INIT_LIST_HEAD(&ls->ls_requestqueue); | |
5788 | + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list); | |
5789 | + ls->ls_last_stop = 0; | |
5790 | + ls->ls_last_start = 0; | |
5791 | + ls->ls_last_finish = 0; | |
5792 | + ls->ls_rcom_msgid = 0; | |
5793 | + init_MUTEX(&ls->ls_rcom_lock); | |
5794 | + init_rwsem(&ls->ls_in_recovery); | |
5795 | + init_rwsem(&ls->ls_unlock_sem); | |
5796 | + init_rwsem(&ls->ls_rec_rsblist); | |
5797 | + init_rwsem(&ls->ls_gap_rsblist); | |
5798 | + down_write(&ls->ls_in_recovery); | |
5799 | + | |
5800 | + for (i = 0; i < RESDIRHASH_SIZE; i++) { | |
5801 | + INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist); | |
5802 | + rwlock_init(&ls->ls_resdir_hash[i].rb_lock); | |
5803 | + } | |
5804 | + | |
5805 | + if (flags & DLM_LSF_NOTIMERS) | |
5806 | + set_bit(LSFL_NOTIMERS, &ls->ls_flags); | |
5807 | + | |
5808 | + /* | |
5809 | + * Connect this lockspace with the cluster manager | |
5810 | + */ | |
5811 | + | |
5812 | + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM, | |
5813 | + &ls_ops, TRUE, (void *) ls, &local_id); | |
5814 | + if (error) | |
5815 | + goto out_idtblfree; | |
5816 | + | |
5817 | + ls->ls_state = LSST_INIT; | |
5818 | + ls->ls_local_id = local_id; | |
5819 | + | |
5820 | + spin_lock(&lslist_lock); | |
5821 | + list_add(&ls->ls_list, &lslist); | |
5822 | + spin_unlock(&lslist_lock); | |
5823 | + | |
5824 | + error = kcl_join_service(local_id); | |
5825 | + if (error) { | |
5826 | + log_error(ls, "service manager join error %d", error); | |
5827 | + goto out_reg; | |
5828 | + } | |
5829 | + | |
5830 | + /* The ls isn't actually running until it receives a start() from CMAN. | |
5831 | + * Neither does it have a global ls id until started. */ | |
5832 | + | |
5833 | + | |
5834 | + /* Return the local ID as the lockspace handle. I've left this | |
5835 | + cast to a void* as it allows us to replace it with pretty much | |
5836 | + anything at a future date without breaking clients. But returning | |
5837 | + the address of the lockspace is a bad idea as it could get | |
5838 | + forcibly removed, leaving client with a dangling pointer */ | |
5839 | + *lockspace = (void *)local_id; | |
5840 | + | |
5841 | + return 0; | |
5842 | + | |
5843 | + out_reg: | |
5844 | + kcl_unregister_service(ls->ls_local_id); | |
5845 | + | |
5846 | + out_idtblfree: | |
5847 | + free_lockidtbl(ls); | |
5848 | + | |
5849 | + out_htfree: | |
5850 | + kfree(ls->ls_reshashtbl); | |
5851 | + | |
5852 | + out_lsfree: | |
5853 | + free_ls(ls); | |
5854 | + | |
5855 | + out: | |
5856 | + return error; | |
5857 | +} | |
5858 | + | |
5859 | +/* | |
5860 | + * Called by a system like GFS which wants independent lock spaces. | |
5861 | + */ | |
5862 | + | |
5863 | +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags) | |
5864 | +{ | |
5865 | + int error = -ENOSYS; | |
5866 | + | |
5867 | + down(&gdlmstate_lock); | |
5868 | + | |
5869 | + error = init_internal(); | |
5870 | + if (error) | |
5871 | + goto out; | |
5872 | + | |
5873 | + error = new_lockspace(name, namelen, lockspace, flags); | |
5874 | + | |
5875 | + out: | |
5876 | + up(&gdlmstate_lock); | |
5877 | + | |
5878 | + return error; | |
5879 | +} | |
5880 | + | |
5881 | +/* Return 1 if the lockspace still has active remote locks, | |
5882 | + * 2 if the lockspace still has active local locks. | |
5883 | + */ | |
5884 | +static int lockspace_busy(gd_ls_t *ls) | |
5885 | +{ | |
5886 | + int i; | |
5887 | + int lkb_found = 0; | |
5888 | + gd_lkb_t *lkb; | |
5889 | + | |
5890 | + /* NOTE: We check the lockidtbl here rather than the resource table. | |
5891 | + * This is because there may be LKBs queued as ASTs that have been unlinked | |
5892 | + * from their RSBs and are pending deletion once the AST has been delivered | |
5893 | + */ | |
5894 | + read_lock(&ls->ls_lockidtbl_lock); | |
5895 | + for (i = 0; i < ls->ls_lockidtbl_size; i++) { | |
5896 | + if (!list_empty(&ls->ls_lockidtbl[i].list)) { | |
5897 | + lkb_found = 1; | |
5898 | + list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) { | |
5899 | + if (!lkb->lkb_nodeid) { | |
5900 | + read_unlock(&ls->ls_lockidtbl_lock); | |
5901 | + return 2; | |
5902 | + } | |
5903 | + } | |
5904 | + } | |
5905 | + } | |
5906 | + read_unlock(&ls->ls_lockidtbl_lock); | |
5907 | + return lkb_found; | |
5908 | +} | |
5909 | + | |
5910 | +/* Actually release the lockspace */ | |
5911 | +static int release_lockspace(gd_ls_t *ls, int force) | |
5912 | +{ | |
5913 | + gd_lkb_t *lkb; | |
5914 | + gd_res_t *rsb; | |
5915 | + gd_recover_t *gr; | |
5916 | + gd_csb_t *csb; | |
5917 | + struct list_head *head; | |
5918 | + int i; | |
5919 | + int busy = lockspace_busy(ls); | |
5920 | + | |
5921 | + /* Don't destroy a busy lockspace */ | |
5922 | + if (busy > force) | |
5923 | + return -EBUSY; | |
5924 | + | |
5925 | + if (force < 3) { | |
5926 | + kcl_leave_service(ls->ls_local_id); | |
5927 | + kcl_unregister_service(ls->ls_local_id); | |
5928 | + } | |
5929 | + | |
5930 | + spin_lock(&lslist_lock); | |
5931 | + list_del(&ls->ls_list); | |
5932 | + spin_unlock(&lslist_lock); | |
5933 | + | |
5934 | + /* | |
5935 | + * Free resdata structs. | |
5936 | + */ | |
5937 | + | |
5938 | + resdir_clear(ls); | |
5939 | + | |
5940 | + /* | |
5941 | + * Free all lkb's on lockidtbl[] lists. | |
5942 | + */ | |
5943 | + | |
5944 | + for (i = 0; i < ls->ls_lockidtbl_size; i++) { | |
5945 | + head = &ls->ls_lockidtbl[i].list; | |
5946 | + while (!list_empty(head)) { | |
5947 | + lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list); | |
5948 | + list_del(&lkb->lkb_idtbl_list); | |
5949 | + | |
5950 | + if (lkb->lkb_lockqueue_state) | |
5951 | + remove_from_lockqueue(lkb); | |
5952 | + | |
5cdbd17b | 5953 | + if (lkb->lkb_astflags & (AST_COMP | AST_BAST)) |
4bf12011 | 5954 | + list_del(&lkb->lkb_astqueue); |
5955 | + | |
5956 | + if (lkb->lkb_lvbptr | |
5957 | + && lkb->lkb_flags & GDLM_LKFLG_MSTCPY) | |
5958 | + free_lvb(lkb->lkb_lvbptr); | |
5959 | + | |
5960 | + free_lkb(lkb); | |
5961 | + } | |
5962 | + } | |
5963 | + | |
5964 | + /* | |
5965 | + * Free lkidtbl[] itself | |
5966 | + */ | |
5967 | + | |
5968 | + kfree(ls->ls_lockidtbl); | |
5969 | + | |
5970 | + /* | |
5971 | + * Free all rsb's on reshashtbl[] lists | |
5972 | + */ | |
5973 | + | |
5974 | + for (i = 0; i < ls->ls_hashsize; i++) { | |
5975 | + head = &ls->ls_reshashtbl[i]; | |
5976 | + while (!list_empty(head)) { | |
5977 | + rsb = list_entry(head->next, gd_res_t, res_hashchain); | |
5978 | + list_del(&rsb->res_hashchain); | |
5979 | + | |
5980 | + if (rsb->res_lvbptr) | |
5981 | + free_lvb(rsb->res_lvbptr); | |
5982 | + | |
5983 | + free_rsb(rsb); | |
5984 | + } | |
5985 | + } | |
5986 | + | |
5987 | + /* | |
5988 | + * Free reshashtbl[] itself | |
5989 | + */ | |
5990 | + | |
5991 | + kfree(ls->ls_reshashtbl); | |
5992 | + | |
5993 | + /* | |
5994 | + * Free structures on any other lists | |
5995 | + */ | |
5996 | + | |
5997 | + head = &ls->ls_recover; | |
5998 | + while (!list_empty(head)) { | |
5999 | + gr = list_entry(head->next, gd_recover_t, gr_list); | |
6000 | + list_del(&gr->gr_list); | |
6001 | + free_dlm_recover(gr); | |
6002 | + } | |
6003 | + | |
6004 | + head = &ls->ls_nodes; | |
6005 | + while (!list_empty(head)) { | |
6006 | + csb = list_entry(head->next, gd_csb_t, csb_list); | |
6007 | + list_del(&csb->csb_list); | |
6008 | + release_csb(csb); | |
6009 | + } | |
6010 | + | |
6011 | + head = &ls->ls_nodes_gone; | |
6012 | + while (!list_empty(head)) { | |
6013 | + csb = list_entry(head->next, gd_csb_t, csb_list); | |
6014 | + list_del(&csb->csb_list); | |
6015 | + release_csb(csb); | |
6016 | + } | |
6017 | + | |
6018 | + free_ls(ls); | |
6019 | + | |
6020 | + dlm_release(); | |
6021 | + | |
6022 | + module_put(THIS_MODULE); | |
6023 | + return 0; | |
6024 | +} | |
6025 | + | |
6026 | + | |
6027 | +/* | |
6028 | + * Called when a system has released all its locks and is not going to use the | |
6029 | + * lockspace any longer. We blindly free everything we're managing for this | |
6030 | + * lockspace. Remaining nodes will go through the recovery process as if we'd | |
6031 | + * died. The lockspace must continue to function as usual, participating in | |
6032 | + * recoveries, until kcl_leave_service returns. | |
6033 | + * | |
6034 | + * Force has 4 possible values: | |
6035 | + * 0 - don't destroy locksapce if it has any LKBs | |
6036 | + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs | |
6037 | + * 2 - destroy lockspace regardless of LKBs | |
6038 | + * 3 - destroy lockspace as part of a forced shutdown | |
6039 | + */ | |
6040 | + | |
6041 | +int dlm_release_lockspace(void *lockspace, int force) | |
6042 | +{ | |
6043 | + gd_ls_t *ls; | |
6044 | + | |
6045 | + ls = find_lockspace_by_local_id(lockspace); | |
6046 | + if (!ls) | |
6047 | + return -EINVAL; | |
6048 | + | |
6049 | + return release_lockspace(ls, force); | |
6050 | +} | |
6051 | + | |
6052 | + | |
6053 | +/* Called when the cluster is being shut down dirtily */ | |
6054 | +void dlm_emergency_shutdown() | |
6055 | +{ | |
6056 | + gd_ls_t *ls; | |
6057 | + gd_ls_t *tmp; | |
6058 | + | |
6059 | + /* Shut lowcomms down to prevent any socket activity */ | |
6060 | + lowcomms_stop_accept(); | |
6061 | + | |
6062 | + /* Delete the devices that belong the the userland | |
6063 | + lockspaces to be deleted. */ | |
6064 | + dlm_device_free_devices(); | |
6065 | + | |
6066 | + /* Now try to clean the lockspaces */ | |
6067 | + spin_lock(&lslist_lock); | |
6068 | + | |
6069 | + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) { | |
6070 | + spin_unlock(&lslist_lock); | |
6071 | + release_lockspace(ls, 3); | |
6072 | + spin_lock(&lslist_lock); | |
6073 | + } | |
6074 | + | |
6075 | + spin_unlock(&lslist_lock); | |
6076 | +} | |
6077 | + | |
6078 | +gd_recover_t *allocate_dlm_recover(void) | |
6079 | +{ | |
6080 | + gd_recover_t *gr; | |
6081 | + | |
6082 | + gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL); | |
6083 | + if (gr) | |
6084 | + memset(gr, 0, sizeof(gd_recover_t)); | |
6085 | + | |
6086 | + return gr; | |
6087 | +} | |
6088 | + | |
6089 | +void free_dlm_recover(gd_recover_t * gr) | |
6090 | +{ | |
6091 | + kfree(gr); | |
6092 | +} | |
6093 | + | |
6094 | +/* | |
6095 | + * Called by CMAN on a specific ls. "stop" means set flag which while set | |
6096 | + * causes all new requests to ls to be queued and not submitted until flag is | |
6097 | + * cleared. stop on a ls also needs to cancel any prior starts on the ls. | |
6098 | + * The recoverd thread carries out any work called for by this event. | |
6099 | + */ | |
6100 | + | |
6101 | +static int dlm_ls_stop(void *servicedata) | |
6102 | +{ | |
6103 | + gd_ls_t *ls = (gd_ls_t *) servicedata; | |
6104 | + int new; | |
6105 | + | |
6106 | + spin_lock(&ls->ls_recover_lock); | |
6107 | + ls->ls_last_stop = ls->ls_last_start; | |
6108 | + set_bit(LSFL_LS_STOP, &ls->ls_flags); | |
6109 | + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags); | |
6110 | + spin_unlock(&ls->ls_recover_lock); | |
6111 | + | |
6112 | + /* | |
6113 | + * This in_recovery lock does two things: | |
6114 | + * | |
6115 | + * 1) Keeps this function from returning until all threads are out | |
6116 | + * of locking routines and locking is truely stopped. | |
6117 | + * 2) Keeps any new requests from being processed until it's unlocked | |
6118 | + * when recovery is complete. | |
6119 | + */ | |
6120 | + | |
6121 | + if (new) | |
6122 | + down_write(&ls->ls_in_recovery); | |
6123 | + | |
6124 | + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags); | |
6125 | + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags); | |
6126 | + clear_bit(LSFL_NODES_VALID, &ls->ls_flags); | |
6127 | + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags); | |
6128 | + | |
6129 | + recoverd_kick(ls); | |
6130 | + | |
6131 | + return 0; | |
6132 | +} | |
6133 | + | |
6134 | +/* | |
6135 | + * Called by CMAN on a specific ls. "start" means enable the lockspace to do | |
6136 | + * request processing which first requires that the recovery procedure be | |
6137 | + * stepped through with all nodes sharing the lockspace (nodeids). The first | |
6138 | + * start on the ls after it's created is a special case and requires some extra | |
6139 | + * work like figuring out our own local nodeid. We can't do all this in the | |
6140 | + * calling CMAN context, so we must pass this work off to the recoverd thread | |
6141 | + * which was created in gdlm_init(). The recoverd thread carries out any work | |
6142 | + * called for by this event. | |
6143 | + */ | |
6144 | + | |
6145 | +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count, | |
6146 | + int event_id, int type) | |
6147 | +{ | |
6148 | + gd_ls_t *ls = (gd_ls_t *) servicedata; | |
6149 | + gd_recover_t *gr; | |
6150 | + int error = -ENOMEM; | |
6151 | + | |
6152 | + gr = allocate_dlm_recover(); | |
6153 | + if (!gr) | |
6154 | + goto out; | |
6155 | + | |
6156 | + gr->gr_nodeids = nodeids; | |
6157 | + gr->gr_node_count = count; | |
6158 | + gr->gr_event_id = event_id; | |
6159 | + | |
6160 | + spin_lock(&ls->ls_recover_lock); | |
6161 | + ls->ls_last_start = event_id; | |
6162 | + list_add_tail(&gr->gr_list, &ls->ls_recover); | |
6163 | + set_bit(LSFL_LS_START, &ls->ls_flags); | |
6164 | + spin_unlock(&ls->ls_recover_lock); | |
6165 | + | |
6166 | + recoverd_kick(ls); | |
6167 | + error = 0; | |
6168 | + | |
6169 | + out: | |
6170 | + return error; | |
6171 | +} | |
6172 | + | |
6173 | +/* | |
6174 | + * Called by CMAN on a specific ls. "finish" means that all nodes which | |
6175 | + * received a "start" have completed the start and called kcl_start_done. | |
6176 | + * The recoverd thread carries out any work called for by this event. | |
6177 | + */ | |
6178 | + | |
6179 | +static void dlm_ls_finish(void *servicedata, int event_id) | |
6180 | +{ | |
6181 | + gd_ls_t *ls = (gd_ls_t *) servicedata; | |
6182 | + | |
6183 | + spin_lock(&ls->ls_recover_lock); | |
6184 | + ls->ls_last_finish = event_id; | |
6185 | + set_bit(LSFL_LS_FINISH, &ls->ls_flags); | |
6186 | + spin_unlock(&ls->ls_recover_lock); | |
6187 | + | |
6188 | + recoverd_kick(ls); | |
6189 | +} | |
6190 | + | |
6191 | +struct kcl_service_ops ls_ops = { | |
6192 | + .stop = dlm_ls_stop, | |
6193 | + .start = dlm_ls_start, | |
6194 | + .finish = dlm_ls_finish | |
6195 | +}; | |
6196 | diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h | |
6197 | --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 6198 | +++ linux-patched/cluster/dlm/lockspace.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 6199 | @@ -0,0 +1,29 @@ |
6200 | +/****************************************************************************** | |
6201 | +******************************************************************************* | |
6202 | +** | |
6203 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
6204 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
6205 | +** | |
6206 | +** This copyrighted material is made available to anyone wishing to use, | |
6207 | +** modify, copy, or redistribute it subject to the terms and conditions | |
6208 | +** of the GNU General Public License v.2. | |
6209 | +** | |
6210 | +******************************************************************************* | |
6211 | +******************************************************************************/ | |
6212 | + | |
6213 | +#ifndef __LOCKSPACE_DOT_H__ | |
6214 | +#define __LOCKSPACE_DOT_H__ | |
6215 | + | |
6216 | +void dlm_lockspace_init(void); | |
6217 | +int dlm_init(void); | |
6218 | +int dlm_release(void); | |
6219 | +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags); | |
6220 | +int dlm_release_lockspace(void *ls, int force); | |
6221 | +gd_ls_t *find_lockspace_by_global_id(uint32_t id); | |
6222 | +gd_ls_t *find_lockspace_by_local_id(void *id); | |
6223 | +gd_ls_t *find_lockspace_by_name(char *name, int namelen); | |
6224 | +void free_dlm_recover(gd_recover_t *gr); | |
6225 | +int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out); | |
6226 | +void dlm_emergency_shutdown(void); | |
6227 | + | |
6228 | +#endif /* __LOCKSPACE_DOT_H__ */ | |
6229 | diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c | |
6230 | --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 6231 | +++ linux-patched/cluster/dlm/lowcomms.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 6232 | @@ -0,0 +1,1354 @@ |
6233 | +/****************************************************************************** | |
6234 | +******************************************************************************* | |
6235 | +** | |
6236 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
6237 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
6238 | +** | |
6239 | +** This copyrighted material is made available to anyone wishing to use, | |
6240 | +** modify, copy, or redistribute it subject to the terms and conditions | |
6241 | +** of the GNU General Public License v.2. | |
6242 | +** | |
6243 | +******************************************************************************* | |
6244 | +******************************************************************************/ | |
6245 | + | |
6246 | +/* | |
6247 | + * lowcomms.c | |
6248 | + * | |
6249 | + * This is the "low-level" comms layer. | |
6250 | + * | |
6251 | + * It is responsible for sending/receiving messages | |
6252 | + * from other nodes in the cluster. | |
6253 | + * | |
6254 | + * Cluster nodes are referred to by their nodeids. nodeids are | |
6255 | + * simply 32 bit numbers to the locking module - if they need to | |
6256 | + * be expanded for the cluster infrastructure then that is it's | |
6257 | + * responsibility. It is this layer's | |
6258 | + * responsibility to resolve these into IP address or | |
6259 | + * whatever it needs for inter-node communication. | |
6260 | + * | |
6261 | + * The comms level is two kernel threads that deal mainly with | |
6262 | + * the receiving of messages from other nodes and passing them | |
6263 | + * up to the mid-level comms layer (which understands the | |
6264 | + * message format) for execution by the locking core, and | |
6265 | + * a send thread which does all the setting up of connections | |
6266 | + * to remote nodes and the sending of data. Threads are not allowed | |
6267 | + * to send their own data because it may cause them to wait in times | |
6268 | + * of high load. Also, this way, the sending thread can collect together | |
6269 | + * messages bound for one node and send them in one block. | |
6270 | + * | |
6271 | + * I don't see any problem with the recv thread executing the locking | |
6272 | + * code on behalf of remote processes as the locking code is | |
6273 | + * short, efficient and never waits. | |
6274 | + * | |
6275 | + */ | |
6276 | + | |
6277 | + | |
6278 | +#include <asm/ioctls.h> | |
6279 | +#include <net/sock.h> | |
6280 | +#include <net/tcp.h> | |
6281 | +#include <linux/pagemap.h> | |
6282 | +#include <cluster/cnxman.h> | |
6283 | + | |
6284 | +#include "dlm_internal.h" | |
6285 | +#include "lowcomms.h" | |
6286 | +#include "midcomms.h" | |
6287 | +#include "config.h" | |
6288 | + | |
6289 | +struct cbuf { | |
6290 | + unsigned base; | |
6291 | + unsigned len; | |
6292 | + unsigned mask; | |
6293 | +}; | |
6294 | + | |
6295 | +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0) | |
6296 | +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) | |
6297 | +#define CBUF_EMPTY(cb) ((cb)->len == 0) | |
6298 | +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) | |
6299 | +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \ | |
6300 | + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0) | |
6301 | +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) | |
6302 | + | |
6303 | +struct connection { | |
6304 | + struct socket *sock; /* NULL if not connected */ | |
6305 | + uint32_t nodeid; /* So we know who we are in the list */ | |
6306 | + struct rw_semaphore sock_sem; /* Stop connect races */ | |
6307 | + struct list_head read_list; /* On this list when ready for reading */ | |
6308 | + struct list_head write_list; /* On this list when ready for writing */ | |
6309 | + struct list_head state_list; /* On this list when ready to connect */ | |
6310 | + unsigned long flags; /* bit 1,2 = We are on the read/write lists */ | |
6311 | +#define CF_READ_PENDING 1 | |
6312 | +#define CF_WRITE_PENDING 2 | |
6313 | +#define CF_CONNECT_PENDING 3 | |
6314 | +#define CF_IS_OTHERSOCK 4 | |
6315 | + struct list_head writequeue; /* List of outgoing writequeue_entries */ | |
6316 | + struct list_head listenlist; /* List of allocated listening sockets */ | |
6317 | + spinlock_t writequeue_lock; | |
6318 | + int (*rx_action) (struct connection *); /* What to do when active */ | |
6319 | + struct page *rx_page; | |
6320 | + struct cbuf cb; | |
6321 | + int retries; | |
6322 | +#define MAX_CONNECT_RETRIES 3 | |
6323 | + struct connection *othersock; | |
6324 | +}; | |
6325 | +#define sock2con(x) ((struct connection *)(x)->sk_user_data) | |
6326 | +#define nodeid2con(x) (&connections[(x)]) | |
6327 | + | |
6328 | +/* An entry waiting to be sent */ | |
6329 | +struct writequeue_entry { | |
6330 | + struct list_head list; | |
6331 | + struct page *page; | |
6332 | + int offset; | |
6333 | + int len; | |
6334 | + int end; | |
6335 | + int users; | |
6336 | + struct connection *con; | |
6337 | +}; | |
6338 | + | |
6339 | +/* "Template" structure for IPv4 and IPv6 used to fill | |
6340 | + * in the missing bits when converting between cman (which knows | |
6341 | + * nothing about sockaddr structs) and real life where we actually | |
6342 | + * have to connect to these addresses. Also one of these structs | |
6343 | + * will hold the cached "us" address. | |
6344 | + * | |
6345 | + * It's an in6 sockaddr just so there's enough space for anything | |
6346 | + * we're likely to see here. | |
6347 | + */ | |
6348 | +static struct sockaddr_in6 local_addr; | |
6349 | + | |
6350 | +/* Manage daemons */ | |
6351 | +static struct semaphore thread_lock; | |
6352 | +static struct completion thread_completion; | |
6353 | +static atomic_t send_run; | |
6354 | +static atomic_t recv_run; | |
6355 | + | |
6356 | +/* An array of connections, indexed by NODEID */ | |
6357 | +static struct connection *connections; | |
6358 | +static int conn_array_size; | |
6359 | +static atomic_t writequeue_length; | |
6360 | +static atomic_t accepting; | |
6361 | + | |
6362 | +static wait_queue_t lowcomms_send_waitq_head; | |
6363 | +static wait_queue_head_t lowcomms_send_waitq; | |
6364 | + | |
6365 | +static wait_queue_t lowcomms_recv_waitq_head; | |
6366 | +static wait_queue_head_t lowcomms_recv_waitq; | |
6367 | + | |
6368 | +/* List of sockets that have reads pending */ | |
6369 | +static struct list_head read_sockets; | |
6370 | +static spinlock_t read_sockets_lock; | |
6371 | + | |
6372 | +/* List of sockets which have writes pending */ | |
6373 | +static struct list_head write_sockets; | |
6374 | +static spinlock_t write_sockets_lock; | |
6375 | + | |
6376 | +/* List of sockets which have connects pending */ | |
6377 | +static struct list_head state_sockets; | |
6378 | +static spinlock_t state_sockets_lock; | |
6379 | + | |
6380 | +/* List of allocated listen sockets */ | |
6381 | +static struct list_head listen_sockets; | |
6382 | + | |
6383 | +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr); | |
6384 | +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len); | |
6385 | + | |
6386 | + | |
6387 | +/* Data available on socket or listen socket received a connect */ | |
6388 | +static void lowcomms_data_ready(struct sock *sk, int count_unused) | |
6389 | +{ | |
6390 | + struct connection *con = sock2con(sk); | |
6391 | + | |
6392 | + if (test_and_set_bit(CF_READ_PENDING, &con->flags)) | |
6393 | + return; | |
6394 | + | |
6395 | + spin_lock_bh(&read_sockets_lock); | |
6396 | + list_add_tail(&con->read_list, &read_sockets); | |
6397 | + spin_unlock_bh(&read_sockets_lock); | |
6398 | + | |
6399 | + wake_up_interruptible(&lowcomms_recv_waitq); | |
6400 | +} | |
6401 | + | |
6402 | +static void lowcomms_write_space(struct sock *sk) | |
6403 | +{ | |
6404 | + struct connection *con = sock2con(sk); | |
6405 | + | |
6406 | + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) | |
6407 | + return; | |
6408 | + | |
6409 | + spin_lock_bh(&write_sockets_lock); | |
6410 | + list_add_tail(&con->write_list, &write_sockets); | |
6411 | + spin_unlock_bh(&write_sockets_lock); | |
6412 | + | |
6413 | + wake_up_interruptible(&lowcomms_send_waitq); | |
6414 | +} | |
6415 | + | |
6416 | +static inline void lowcomms_connect_sock(struct connection *con) | |
6417 | +{ | |
6418 | + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) | |
6419 | + return; | |
6420 | + if (!atomic_read(&accepting)) | |
6421 | + return; | |
6422 | + | |
6423 | + spin_lock_bh(&state_sockets_lock); | |
6424 | + list_add_tail(&con->state_list, &state_sockets); | |
6425 | + spin_unlock_bh(&state_sockets_lock); | |
6426 | + | |
6427 | + wake_up_interruptible(&lowcomms_send_waitq); | |
6428 | +} | |
6429 | + | |
6430 | +static void lowcomms_state_change(struct sock *sk) | |
6431 | +{ | |
6432 | +/* struct connection *con = sock2con(sk); */ | |
6433 | + | |
6434 | + switch (sk->sk_state) { | |
6435 | + case TCP_ESTABLISHED: | |
6436 | + lowcomms_write_space(sk); | |
6437 | + break; | |
6438 | + | |
6439 | + case TCP_FIN_WAIT1: | |
6440 | + case TCP_FIN_WAIT2: | |
6441 | + case TCP_TIME_WAIT: | |
6442 | + case TCP_CLOSE: | |
6443 | + case TCP_CLOSE_WAIT: | |
6444 | + case TCP_LAST_ACK: | |
6445 | + case TCP_CLOSING: | |
6446 | + /* FIXME: I think this causes more trouble than it solves. | |
6447 | + lowcomms wil reconnect anyway when there is something to | |
6448 | + send. This just attempts reconnection if a node goes down! | |
6449 | + */ | |
6450 | + /* lowcomms_connect_sock(con); */ | |
6451 | + break; | |
6452 | + | |
6453 | + default: | |
6454 | + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state); | |
6455 | + break; | |
6456 | + } | |
6457 | +} | |
6458 | + | |
6459 | +/* Make a socket active */ | |
6460 | +static int add_sock(struct socket *sock, struct connection *con) | |
6461 | +{ | |
6462 | + con->sock = sock; | |
6463 | + | |
6464 | + /* Install a data_ready callback */ | |
6465 | + con->sock->sk->sk_data_ready = lowcomms_data_ready; | |
6466 | + con->sock->sk->sk_write_space = lowcomms_write_space; | |
6467 | + con->sock->sk->sk_state_change = lowcomms_state_change; | |
6468 | + | |
6469 | + return 0; | |
6470 | +} | |
6471 | + | |
6472 | +/* Add the port number to an IP6 or 4 sockaddr and return the address | |
6473 | + length */ | |
6474 | +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port, | |
6475 | + int *addr_len) | |
6476 | +{ | |
6477 | + saddr->sin6_family = local_addr.sin6_family; | |
6478 | + if (local_addr.sin6_family == AF_INET) { | |
6479 | + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; | |
6480 | + in4_addr->sin_port = cpu_to_be16(port); | |
6481 | + *addr_len = sizeof(struct sockaddr_in); | |
6482 | + } | |
6483 | + else { | |
6484 | + saddr->sin6_port = cpu_to_be16(port); | |
6485 | + *addr_len = sizeof(struct sockaddr_in6); | |
6486 | + } | |
6487 | +} | |
6488 | + | |
6489 | +/* Close a remote connection and tidy up */ | |
6490 | +static void close_connection(struct connection *con) | |
6491 | +{ | |
6492 | + if (test_bit(CF_IS_OTHERSOCK, &con->flags)) | |
6493 | + return; | |
6494 | + | |
6495 | + down_write(&con->sock_sem); | |
6496 | + | |
6497 | + if (con->sock) { | |
6498 | + sock_release(con->sock); | |
6499 | + con->sock = NULL; | |
6500 | + if (con->othersock) { | |
6501 | + down_write(&con->othersock->sock_sem); | |
6502 | + sock_release(con->othersock->sock); | |
6503 | + con->othersock->sock = NULL; | |
6504 | + up_write(&con->othersock->sock_sem); | |
6505 | + kfree(con->othersock); | |
6506 | + con->othersock = NULL; | |
6507 | + } | |
6508 | + } | |
6509 | + if (con->rx_page) { | |
6510 | + __free_page(con->rx_page); | |
6511 | + con->rx_page = NULL; | |
6512 | + } | |
6513 | + up_write(&con->sock_sem); | |
6514 | +} | |
6515 | + | |
6516 | +/* Data received from remote end */ | |
6517 | +static int receive_from_sock(struct connection *con) | |
6518 | +{ | |
6519 | + int ret = 0; | |
6520 | + struct msghdr msg; | |
6521 | + struct iovec iov[2]; | |
6522 | + mm_segment_t fs; | |
6523 | + unsigned len; | |
6524 | + int r; | |
6525 | + int call_again_soon = 0; | |
6526 | + | |
6527 | + down_read(&con->sock_sem); | |
6528 | + | |
6529 | + if (con->sock == NULL) | |
6530 | + goto out; | |
6531 | + if (con->rx_page == NULL) { | |
6532 | + /* | |
6533 | + * This doesn't need to be atomic, but I think it should | |
6534 | + * improve performance if it is. | |
6535 | + */ | |
6536 | + con->rx_page = alloc_page(GFP_ATOMIC); | |
6537 | + if (con->rx_page == NULL) | |
6538 | + goto out_resched; | |
6539 | + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE); | |
6540 | + } | |
6541 | + /* | |
6542 | + * To avoid doing too many short reads, we will reschedule for another | |
6543 | + * another time if there are less than 32 bytes left in the buffer. | |
6544 | + */ | |
6545 | + if (!CBUF_MAY_ADD(&con->cb, 32)) | |
6546 | + goto out_resched; | |
6547 | + | |
6548 | + msg.msg_control = NULL; | |
6549 | + msg.msg_controllen = 0; | |
6550 | + msg.msg_iovlen = 1; | |
6551 | + msg.msg_iov = iov; | |
6552 | + msg.msg_name = NULL; | |
6553 | + msg.msg_namelen = 0; | |
6554 | + msg.msg_flags = 0; | |
6555 | + | |
6556 | + /* | |
6557 | + * iov[0] is the bit of the circular buffer between the current end | |
6558 | + * point (cb.base + cb.len) and the end of the buffer. | |
6559 | + */ | |
6560 | + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb); | |
6561 | + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb); | |
6562 | + iov[1].iov_len = 0; | |
6563 | + | |
6564 | + /* | |
6565 | + * iov[1] is the bit of the circular buffer between the start of the | |
6566 | + * buffer and the start of the currently used section (cb.base) | |
6567 | + */ | |
6568 | + if (CBUF_DATA(&con->cb) >= con->cb.base) { | |
6569 | + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb); | |
6570 | + iov[1].iov_len = con->cb.base; | |
6571 | + iov[1].iov_base = page_address(con->rx_page); | |
6572 | + msg.msg_iovlen = 2; | |
6573 | + } | |
6574 | + len = iov[0].iov_len + iov[1].iov_len; | |
6575 | + | |
6576 | + fs = get_fs(); | |
6577 | + set_fs(get_ds()); | |
6578 | + r = ret = sock_recvmsg(con->sock, &msg, len, | |
6579 | + MSG_DONTWAIT | MSG_NOSIGNAL); | |
6580 | + set_fs(fs); | |
6581 | + | |
6582 | + if (ret <= 0) | |
6583 | + goto out_close; | |
6584 | + if (ret == len) | |
6585 | + call_again_soon = 1; | |
6586 | + CBUF_ADD(&con->cb, ret); | |
6587 | + ret = midcomms_process_incoming_buffer(con->nodeid, | |
6588 | + page_address(con->rx_page), | |
6589 | + con->cb.base, con->cb.len, | |
6590 | + PAGE_CACHE_SIZE); | |
6591 | + if (ret == -EBADMSG) { | |
6592 | + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " | |
6593 | + "iov_len=%u, iov_base[0]=%p, read=%d\n", | |
6594 | + page_address(con->rx_page), con->cb.base, con->cb.len, | |
6595 | + len, iov[0].iov_base, r); | |
6596 | + } | |
6597 | + if (ret < 0) | |
6598 | + goto out_close; | |
6599 | + CBUF_EAT(&con->cb, ret); | |
6600 | + | |
6601 | + if (CBUF_EMPTY(&con->cb) && !call_again_soon) { | |
6602 | + __free_page(con->rx_page); | |
6603 | + con->rx_page = NULL; | |
6604 | + } | |
6605 | + out: | |
6606 | + if (call_again_soon) | |
6607 | + goto out_resched; | |
6608 | + up_read(&con->sock_sem); | |
6609 | + ret = 0; | |
6610 | + goto out_ret; | |
6611 | + | |
6612 | + out_resched: | |
6613 | + lowcomms_data_ready(con->sock->sk, 0); | |
6614 | + up_read(&con->sock_sem); | |
6615 | + ret = 0; | |
6616 | + goto out_ret; | |
6617 | + | |
6618 | + out_close: | |
6619 | + up_read(&con->sock_sem); | |
6620 | + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) { | |
6621 | + close_connection(con); | |
6622 | + lowcomms_connect_sock(con); | |
6623 | + } | |
6624 | + | |
6625 | + out_ret: | |
6626 | + return ret; | |
6627 | +} | |
6628 | + | |
6629 | +/* Listening socket is busy, accept a connection */ | |
6630 | +static int accept_from_sock(struct connection *con) | |
6631 | +{ | |
6632 | + int result; | |
6633 | + struct sockaddr_in6 peeraddr; | |
6634 | + struct socket *newsock; | |
6635 | + int len; | |
6636 | + int nodeid; | |
6637 | + struct connection *newcon; | |
6638 | + | |
6639 | + memset(&peeraddr, 0, sizeof(peeraddr)); | |
6640 | + newsock = sock_alloc(); | |
6641 | + if (!newsock) | |
6642 | + return -ENOMEM; | |
6643 | + | |
6644 | + down_read(&con->sock_sem); | |
6645 | + | |
6646 | + result = -ENOTCONN; | |
6647 | + if (con->sock == NULL) | |
6648 | + goto accept_err; | |
6649 | + | |
6650 | + newsock->type = con->sock->type; | |
6651 | + newsock->ops = con->sock->ops; | |
6652 | + | |
6653 | + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); | |
6654 | + if (result < 0) | |
6655 | + goto accept_err; | |
6656 | + | |
6657 | + /* Get the connected socket's peer */ | |
6658 | + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, | |
6659 | + &len, 2)) { | |
6660 | + result = -ECONNABORTED; | |
6661 | + goto accept_err; | |
6662 | + } | |
6663 | + | |
6664 | + /* Get the new node's NODEID */ | |
6665 | + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len); | |
6666 | + if (nodeid == 0) { | |
6667 | + printk("dlm: connect from non cluster node\n"); | |
6668 | + sock_release(newsock); | |
6669 | + up_read(&con->sock_sem); | |
6670 | + return -1; | |
6671 | + } | |
6672 | + | |
6673 | + log_print("got connection from %d", nodeid); | |
6674 | + | |
6675 | + /* Check to see if we already have a connection to this node. This | |
6676 | + * could happen if the two nodes initiate a connection at roughly | |
6677 | + * the same time and the connections cross on the wire. | |
6678 | + * TEMPORARY FIX: | |
6679 | + * In this case we store the incoming one in "othersock" | |
6680 | + */ | |
6681 | + newcon = nodeid2con(nodeid); | |
6682 | + down_write(&newcon->sock_sem); | |
6683 | + if (newcon->sock) { | |
6684 | + struct connection *othercon; | |
6685 | + | |
6686 | + othercon = kmalloc(sizeof(struct connection), GFP_KERNEL); | |
6687 | + if (!othercon) { | |
6688 | + printk("dlm: failed to allocate incoming socket\n"); | |
6689 | + sock_release(newsock); | |
6690 | + up_write(&newcon->sock_sem); | |
6691 | + up_read(&con->sock_sem); | |
6692 | + goto accept_out; | |
6693 | + } | |
6694 | + memset(othercon, 0, sizeof(*othercon)); | |
6695 | + newcon->othersock = othercon; | |
6696 | + othercon->nodeid = nodeid; | |
6697 | + othercon->sock = newsock; | |
6698 | + othercon->rx_action = receive_from_sock; | |
6699 | + add_sock(newsock, othercon); | |
6700 | + init_rwsem(&othercon->sock_sem); | |
6701 | + set_bit(CF_IS_OTHERSOCK, &othercon->flags); | |
6702 | + newsock->sk->sk_user_data = othercon; | |
6703 | + | |
6704 | + up_write(&newcon->sock_sem); | |
6705 | + lowcomms_data_ready(newsock->sk, 0); | |
6706 | + up_read(&con->sock_sem); | |
6707 | + goto accept_out; | |
6708 | + } | |
6709 | + | |
6710 | + newsock->sk->sk_user_data = newcon; | |
6711 | + newcon->rx_action = receive_from_sock; | |
6712 | + add_sock(newsock, newcon); | |
6713 | + up_write(&newcon->sock_sem); | |
6714 | + | |
6715 | + /* | |
6716 | + * Add it to the active queue in case we got data | |
6717 | + * beween processing the accept adding the socket | |
6718 | + * to the read_sockets list | |
6719 | + */ | |
6720 | + lowcomms_data_ready(newsock->sk, 0); | |
6721 | + | |
6722 | + up_read(&con->sock_sem); | |
6723 | + | |
6724 | + accept_out: | |
6725 | + return 0; | |
6726 | + | |
6727 | + accept_err: | |
6728 | + up_read(&con->sock_sem); | |
6729 | + sock_release(newsock); | |
6730 | + | |
6731 | + printk("dlm: error accepting connection from node: %d\n", result); | |
6732 | + return result; | |
6733 | +} | |
6734 | + | |
6735 | +/* Connect a new socket to its peer */ | |
6736 | +static int connect_to_sock(struct connection *con) | |
6737 | +{ | |
6738 | + int result = -EHOSTUNREACH; | |
6739 | + struct sockaddr_in6 saddr; | |
6740 | + int addr_len; | |
6741 | + struct socket *sock; | |
6742 | + | |
6743 | + if (con->nodeid == 0) { | |
6744 | + log_print("attempt to connect sock 0 foiled"); | |
6745 | + return 0; | |
6746 | + } | |
6747 | + | |
6748 | + down_write(&con->sock_sem); | |
6749 | + if (con->retries++ > MAX_CONNECT_RETRIES) | |
6750 | + goto out; | |
6751 | + | |
6752 | + // FIXME not sure this should happen, let alone like this. | |
6753 | + if (con->sock) { | |
6754 | + sock_release(con->sock); | |
6755 | + con->sock = NULL; | |
6756 | + } | |
6757 | + | |
6758 | + /* Create a socket to communicate with */ | |
6759 | + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock); | |
6760 | + if (result < 0) | |
6761 | + goto out_err; | |
6762 | + | |
6763 | + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0) | |
6764 | + goto out_err; | |
6765 | + | |
6766 | + sock->sk->sk_user_data = con; | |
6767 | + con->rx_action = receive_from_sock; | |
6768 | + | |
6769 | + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); | |
6770 | + | |
6771 | + add_sock(sock, con); | |
6772 | + result = | |
6773 | + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len, | |
6774 | + O_NONBLOCK); | |
6775 | + if (result == -EINPROGRESS) | |
6776 | + result = 0; | |
6777 | + if (result != 0) | |
6778 | + goto out_err; | |
6779 | + | |
6780 | + out: | |
6781 | + up_write(&con->sock_sem); | |
6782 | + /* | |
6783 | + * Returning an error here means we've given up trying to connect to | |
6784 | + * a remote node, otherwise we return 0 and reschedule the connetion | |
6785 | + * attempt | |
6786 | + */ | |
6787 | + return result; | |
6788 | + | |
6789 | + out_err: | |
6790 | + if (con->sock) { | |
6791 | + sock_release(con->sock); | |
6792 | + con->sock = NULL; | |
6793 | + } | |
6794 | + /* | |
6795 | + * Some errors are fatal and this list might need adjusting. For other | |
6796 | + * errors we try again until the max number of retries is reached. | |
6797 | + */ | |
6798 | + if (result != -EHOSTUNREACH && result != -ENETUNREACH && | |
6799 | + result != -ENETDOWN && result != EINVAL | |
6800 | + && result != -EPROTONOSUPPORT) { | |
6801 | + lowcomms_connect_sock(con); | |
6802 | + result = 0; | |
6803 | + } | |
6804 | + goto out; | |
6805 | +} | |
6806 | + | |
6807 | +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len) | |
6808 | +{ | |
6809 | + struct socket *sock = NULL; | |
6810 | + mm_segment_t fs; | |
6811 | + int result = 0; | |
6812 | + int one = 1; | |
6813 | + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr; | |
6814 | + | |
6815 | + /* Create a socket to communicate with */ | |
6816 | + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock); | |
6817 | + if (result < 0) { | |
6818 | + printk("dlm: Can't create listening comms socket\n"); | |
6819 | + goto create_out; | |
6820 | + } | |
6821 | + | |
6822 | + fs = get_fs(); | |
6823 | + set_fs(get_ds()); | |
6824 | + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); | |
6825 | + set_fs(fs); | |
6826 | + if (result < 0) { | |
6827 | + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result); | |
6828 | + } | |
6829 | + sock->sk->sk_user_data = con; | |
6830 | + con->rx_action = accept_from_sock; | |
6831 | + con->sock = sock; | |
6832 | + | |
6833 | + /* Bind to our port */ | |
6834 | + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); | |
6835 | + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); | |
6836 | + if (result < 0) { | |
6837 | + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); | |
6838 | + sock_release(sock); | |
6839 | + sock = NULL; | |
6840 | + goto create_out; | |
6841 | + } | |
6842 | + | |
6843 | + fs = get_fs(); | |
6844 | + set_fs(get_ds()); | |
6845 | + | |
6846 | + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one)); | |
6847 | + set_fs(fs); | |
6848 | + if (result < 0) { | |
6849 | + printk("dlm: Set keepalive failed: %d\n", result); | |
6850 | + } | |
6851 | + | |
6852 | + result = sock->ops->listen(sock, 5); | |
6853 | + if (result < 0) { | |
6854 | + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); | |
6855 | + sock_release(sock); | |
6856 | + sock = NULL; | |
6857 | + goto create_out; | |
6858 | + } | |
6859 | + | |
6860 | + create_out: | |
6861 | + return sock; | |
6862 | +} | |
6863 | + | |
6864 | + | |
6865 | +/* Listen on all interfaces */ | |
6866 | +static int listen_for_all(void) | |
6867 | +{ | |
6868 | + int result = 0; | |
6869 | + int nodeid; | |
6870 | + struct socket *sock = NULL; | |
6871 | + struct list_head *addr_list; | |
6872 | + struct connection *con = nodeid2con(0); | |
6873 | + struct cluster_node_addr *node_addr; | |
6874 | + char local_addr[sizeof(struct sockaddr_in6)]; | |
6875 | + | |
6876 | + /* This will also fill in local_addr */ | |
6877 | + nodeid = lowcomms_our_nodeid(); | |
6878 | + | |
6879 | + addr_list = kcl_get_node_addresses(nodeid); | |
6880 | + if (!addr_list) { | |
6881 | + printk("dlm: cannot initialise comms layer\n"); | |
6882 | + result = -ENOTCONN; | |
6883 | + goto create_out; | |
6884 | + } | |
6885 | + | |
6886 | + list_for_each_entry(node_addr, addr_list, list) { | |
6887 | + | |
6888 | + if (!con) { | |
6889 | + con = kmalloc(sizeof(struct connection), GFP_KERNEL); | |
6890 | + if (!con) { | |
6891 | + printk("dlm: failed to allocate listen socket\n"); | |
6892 | + goto create_out; | |
6893 | + } | |
6894 | + memset(con, 0, sizeof(*con)); | |
6895 | + init_rwsem(&con->sock_sem); | |
6896 | + spin_lock_init(&con->writequeue_lock); | |
6897 | + INIT_LIST_HEAD(&con->writequeue); | |
6898 | + set_bit(CF_IS_OTHERSOCK, &con->flags); | |
6899 | + } | |
6900 | + | |
6901 | + memcpy(local_addr, node_addr->addr, node_addr->addr_len); | |
6902 | + sock = create_listen_sock(con, local_addr, | |
6903 | + node_addr->addr_len); | |
6904 | + if (sock) { | |
6905 | + add_sock(sock, con); | |
6906 | + } | |
6907 | + else { | |
6908 | + kfree(con); | |
6909 | + } | |
6910 | + | |
6911 | + /* Keep a list of dynamically allocated listening sockets | |
6912 | + so we can free them at shutdown */ | |
6913 | + if (test_bit(CF_IS_OTHERSOCK, &con->flags)) { | |
6914 | + list_add_tail(&con->listenlist, &listen_sockets); | |
6915 | + } | |
6916 | + con = NULL; | |
6917 | + } | |
6918 | + | |
6919 | + create_out: | |
6920 | + return result; | |
6921 | +} | |
6922 | + | |
6923 | + | |
6924 | + | |
6925 | +static struct writequeue_entry *new_writequeue_entry(struct connection *con, | |
6926 | + int allocation) | |
6927 | +{ | |
6928 | + struct writequeue_entry *entry; | |
6929 | + | |
6930 | + entry = kmalloc(sizeof(struct writequeue_entry), allocation); | |
6931 | + if (!entry) | |
6932 | + return NULL; | |
6933 | + | |
6934 | + entry->page = alloc_page(allocation); | |
6935 | + if (!entry->page) { | |
6936 | + kfree(entry); | |
6937 | + return NULL; | |
6938 | + } | |
6939 | + | |
6940 | + entry->offset = 0; | |
6941 | + entry->len = 0; | |
6942 | + entry->end = 0; | |
6943 | + entry->users = 0; | |
6944 | + entry->con = con; | |
6945 | + | |
6946 | + return entry; | |
6947 | +} | |
6948 | + | |
6949 | +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len, | |
6950 | + int allocation, char **ppc) | |
6951 | +{ | |
6952 | + struct connection *con = nodeid2con(nodeid); | |
6953 | + struct writequeue_entry *e; | |
6954 | + int offset = 0; | |
6955 | + int users = 0; | |
6956 | + | |
6957 | + if (!atomic_read(&accepting)) | |
6958 | + return NULL; | |
6959 | + | |
6960 | + spin_lock(&con->writequeue_lock); | |
6961 | + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); | |
6962 | + if (((struct list_head *) e == &con->writequeue) || | |
6963 | + (PAGE_CACHE_SIZE - e->end < len)) { | |
6964 | + e = NULL; | |
6965 | + } else { | |
6966 | + offset = e->end; | |
6967 | + e->end += len; | |
6968 | + users = e->users++; | |
6969 | + } | |
6970 | + spin_unlock(&con->writequeue_lock); | |
6971 | + | |
6972 | + if (e) { | |
6973 | + got_one: | |
6974 | + if (users == 0) | |
6975 | + kmap(e->page); | |
6976 | + *ppc = page_address(e->page) + offset; | |
6977 | + return e; | |
6978 | + } | |
6979 | + | |
6980 | + e = new_writequeue_entry(con, allocation); | |
6981 | + if (e) { | |
6982 | + spin_lock(&con->writequeue_lock); | |
6983 | + offset = e->end; | |
6984 | + e->end += len; | |
6985 | + users = e->users++; | |
6986 | + list_add_tail(&e->list, &con->writequeue); | |
6987 | + spin_unlock(&con->writequeue_lock); | |
6988 | + atomic_inc(&writequeue_length); | |
6989 | + goto got_one; | |
6990 | + } | |
6991 | + return NULL; | |
6992 | +} | |
6993 | + | |
6994 | +void lowcomms_commit_buffer(struct writequeue_entry *e) | |
6995 | +{ | |
6996 | + struct connection *con = e->con; | |
6997 | + int users; | |
6998 | + | |
6999 | + if (!atomic_read(&accepting)) | |
7000 | + return; | |
7001 | + | |
7002 | + spin_lock(&con->writequeue_lock); | |
7003 | + users = --e->users; | |
7004 | + if (users) | |
7005 | + goto out; | |
7006 | + e->len = e->end - e->offset; | |
7007 | + kunmap(e->page); | |
7008 | + spin_unlock(&con->writequeue_lock); | |
7009 | + | |
7010 | + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { | |
7011 | + spin_lock_bh(&write_sockets_lock); | |
7012 | + list_add_tail(&con->write_list, &write_sockets); | |
7013 | + spin_unlock_bh(&write_sockets_lock); | |
7014 | + | |
7015 | + wake_up_interruptible(&lowcomms_send_waitq); | |
7016 | + } | |
7017 | + return; | |
7018 | + | |
7019 | + out: | |
7020 | + spin_unlock(&con->writequeue_lock); | |
7021 | + return; | |
7022 | +} | |
7023 | + | |
7024 | +static void free_entry(struct writequeue_entry *e) | |
7025 | +{ | |
7026 | + __free_page(e->page); | |
7027 | + kfree(e); | |
7028 | + atomic_dec(&writequeue_length); | |
7029 | +} | |
7030 | + | |
7031 | +/* Send a message */ | |
7032 | +static int send_to_sock(struct connection *con) | |
7033 | +{ | |
7034 | + int ret = 0; | |
7035 | + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); | |
7036 | + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; | |
7037 | + struct writequeue_entry *e; | |
7038 | + int len, offset; | |
7039 | + | |
7040 | + down_read(&con->sock_sem); | |
7041 | + if (con->sock == NULL) | |
7042 | + goto out_connect; | |
7043 | + | |
7044 | + sendpage = con->sock->ops->sendpage; | |
7045 | + | |
7046 | + spin_lock(&con->writequeue_lock); | |
7047 | + for (;;) { | |
7048 | + e = list_entry(con->writequeue.next, struct writequeue_entry, | |
7049 | + list); | |
7050 | + if ((struct list_head *) e == &con->writequeue) | |
7051 | + break; | |
7052 | + | |
7053 | + len = e->len; | |
7054 | + offset = e->offset; | |
7055 | + BUG_ON(len == 0 && e->users == 0); | |
7056 | + spin_unlock(&con->writequeue_lock); | |
7057 | + | |
7058 | + ret = 0; | |
7059 | + if (len) { | |
7060 | + ret = sendpage(con->sock, e->page, offset, len, | |
7061 | + msg_flags); | |
7062 | + if (ret == -EAGAIN || ret == 0) | |
7063 | + goto out; | |
7064 | + if (ret <= 0) | |
7065 | + goto send_error; | |
7066 | + } | |
7067 | + | |
7068 | + spin_lock(&con->writequeue_lock); | |
7069 | + e->offset += ret; | |
7070 | + e->len -= ret; | |
7071 | + | |
7072 | + if (e->len == 0 && e->users == 0) { | |
7073 | + list_del(&e->list); | |
7074 | + free_entry(e); | |
7075 | + continue; | |
7076 | + } | |
7077 | + } | |
7078 | + spin_unlock(&con->writequeue_lock); | |
7079 | + out: | |
7080 | + up_read(&con->sock_sem); | |
7081 | + return ret; | |
7082 | + | |
7083 | + send_error: | |
7084 | + up_read(&con->sock_sem); | |
7085 | + close_connection(con); | |
7086 | + lowcomms_connect_sock(con); | |
7087 | + return ret; | |
7088 | + | |
7089 | + out_connect: | |
7090 | + up_read(&con->sock_sem); | |
7091 | + lowcomms_connect_sock(con); | |
7092 | + return 0; | |
7093 | +} | |
7094 | + | |
7095 | +/* Called from recoverd when it knows that a node has | |
7096 | + left the cluster */ | |
7097 | +int lowcomms_close(int nodeid) | |
7098 | +{ | |
7099 | + struct connection *con; | |
7100 | + | |
7101 | + if (!connections) | |
7102 | + goto out; | |
7103 | + | |
7104 | + con = nodeid2con(nodeid); | |
7105 | + if (con->sock) { | |
7106 | + close_connection(con); | |
7107 | + return 0; | |
7108 | + } | |
7109 | + | |
7110 | + out: | |
7111 | + return -1; | |
7112 | +} | |
7113 | + | |
7114 | +/* API send message call, may queue the request */ | |
7115 | +/* N.B. This is the old interface - use the new one for new calls */ | |
7116 | +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation) | |
7117 | +{ | |
7118 | + struct writequeue_entry *e; | |
7119 | + char *b; | |
7120 | + | |
7121 | + GDLM_ASSERT(nodeid < dlm_config.max_connections, | |
7122 | + printk("nodeid=%u\n", nodeid);); | |
7123 | + | |
7124 | + e = lowcomms_get_buffer(nodeid, len, allocation, &b); | |
7125 | + if (e) { | |
7126 | + memcpy(b, buf, len); | |
7127 | + lowcomms_commit_buffer(e); | |
7128 | + return 0; | |
7129 | + } | |
7130 | + return -ENOBUFS; | |
7131 | +} | |
7132 | + | |
7133 | +/* Look for activity on active sockets */ | |
7134 | +static void process_sockets(void) | |
7135 | +{ | |
7136 | + struct list_head *list; | |
7137 | + struct list_head *temp; | |
7138 | + | |
7139 | + spin_lock_bh(&read_sockets_lock); | |
7140 | + list_for_each_safe(list, temp, &read_sockets) { | |
7141 | + struct connection *con = | |
7142 | + list_entry(list, struct connection, read_list); | |
7143 | + list_del(&con->read_list); | |
7144 | + clear_bit(CF_READ_PENDING, &con->flags); | |
7145 | + | |
7146 | + spin_unlock_bh(&read_sockets_lock); | |
7147 | + | |
7148 | + con->rx_action(con); | |
7149 | + | |
7150 | + /* Don't starve out everyone else */ | |
7151 | + schedule(); | |
7152 | + spin_lock_bh(&read_sockets_lock); | |
7153 | + } | |
7154 | + spin_unlock_bh(&read_sockets_lock); | |
7155 | +} | |
7156 | + | |
7157 | +/* Try to send any messages that are pending | |
7158 | + */ | |
7159 | +static void process_output_queue(void) | |
7160 | +{ | |
7161 | + struct list_head *list; | |
7162 | + struct list_head *temp; | |
7163 | + int ret; | |
7164 | + | |
7165 | + spin_lock_bh(&write_sockets_lock); | |
7166 | + list_for_each_safe(list, temp, &write_sockets) { | |
7167 | + struct connection *con = | |
7168 | + list_entry(list, struct connection, write_list); | |
7169 | + list_del(&con->write_list); | |
7170 | + clear_bit(CF_WRITE_PENDING, &con->flags); | |
7171 | + | |
7172 | + spin_unlock_bh(&write_sockets_lock); | |
7173 | + | |
7174 | + ret = send_to_sock(con); | |
7175 | + if (ret < 0) { | |
7176 | + } | |
7177 | + spin_lock_bh(&write_sockets_lock); | |
7178 | + } | |
7179 | + spin_unlock_bh(&write_sockets_lock); | |
7180 | +} | |
7181 | + | |
7182 | +static void process_state_queue(void) | |
7183 | +{ | |
7184 | + struct list_head *list; | |
7185 | + struct list_head *temp; | |
7186 | + int ret; | |
7187 | + | |
7188 | + spin_lock_bh(&state_sockets_lock); | |
7189 | + list_for_each_safe(list, temp, &state_sockets) { | |
7190 | + struct connection *con = | |
7191 | + list_entry(list, struct connection, state_list); | |
7192 | + list_del(&con->state_list); | |
7193 | + clear_bit(CF_CONNECT_PENDING, &con->flags); | |
7194 | + spin_unlock_bh(&state_sockets_lock); | |
7195 | + | |
7196 | + ret = connect_to_sock(con); | |
7197 | + if (ret < 0) { | |
7198 | + } | |
7199 | + spin_lock_bh(&state_sockets_lock); | |
7200 | + } | |
7201 | + spin_unlock_bh(&state_sockets_lock); | |
7202 | +} | |
7203 | + | |
7204 | +/* Discard all entries on the write queues */ | |
7205 | +static void clean_writequeues(void) | |
7206 | +{ | |
7207 | + struct list_head *list; | |
7208 | + struct list_head *temp; | |
7209 | + int nodeid; | |
7210 | + | |
7211 | + for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) { | |
7212 | + struct connection *con = nodeid2con(nodeid); | |
7213 | + | |
7214 | + spin_lock(&con->writequeue_lock); | |
7215 | + list_for_each_safe(list, temp, &con->writequeue) { | |
7216 | + struct writequeue_entry *e = | |
7217 | + list_entry(list, struct writequeue_entry, list); | |
7218 | + list_del(&e->list); | |
7219 | + free_entry(e); | |
7220 | + } | |
7221 | + spin_unlock(&con->writequeue_lock); | |
7222 | + } | |
7223 | +} | |
7224 | + | |
7225 | +static int read_list_empty(void) | |
7226 | +{ | |
7227 | + int status; | |
7228 | + | |
7229 | + spin_lock_bh(&read_sockets_lock); | |
7230 | + status = list_empty(&read_sockets); | |
7231 | + spin_unlock_bh(&read_sockets_lock); | |
7232 | + | |
7233 | + return status; | |
7234 | +} | |
7235 | + | |
7236 | +/* DLM Transport comms receive daemon */ | |
7237 | +static int dlm_recvd(void *data) | |
7238 | +{ | |
7239 | + daemonize("dlm_recvd"); | |
7240 | + atomic_set(&recv_run, 1); | |
7241 | + | |
7242 | + init_waitqueue_head(&lowcomms_recv_waitq); | |
7243 | + init_waitqueue_entry(&lowcomms_recv_waitq_head, current); | |
7244 | + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); | |
7245 | + | |
7246 | + complete(&thread_completion); | |
7247 | + | |
7248 | + while (atomic_read(&recv_run)) { | |
7249 | + | |
7250 | + set_task_state(current, TASK_INTERRUPTIBLE); | |
7251 | + | |
7252 | + if (read_list_empty()) | |
7253 | + schedule(); | |
7254 | + | |
7255 | + set_task_state(current, TASK_RUNNING); | |
7256 | + | |
7257 | + process_sockets(); | |
7258 | + } | |
7259 | + | |
7260 | + down(&thread_lock); | |
7261 | + up(&thread_lock); | |
7262 | + | |
7263 | + complete(&thread_completion); | |
7264 | + | |
7265 | + return 0; | |
7266 | +} | |
7267 | + | |
7268 | +static int write_and_state_lists_empty(void) | |
7269 | +{ | |
7270 | + int status; | |
7271 | + | |
7272 | + spin_lock_bh(&write_sockets_lock); | |
7273 | + status = list_empty(&write_sockets); | |
7274 | + spin_unlock_bh(&write_sockets_lock); | |
7275 | + | |
7276 | + spin_lock_bh(&state_sockets_lock); | |
7277 | + if (list_empty(&state_sockets) == 0) | |
7278 | + status = 0; | |
7279 | + spin_unlock_bh(&state_sockets_lock); | |
7280 | + | |
7281 | + return status; | |
7282 | +} | |
7283 | + | |
7284 | +/* DLM Transport send daemon */ | |
7285 | +static int dlm_sendd(void *data) | |
7286 | +{ | |
7287 | + daemonize("dlm_sendd"); | |
7288 | + atomic_set(&send_run, 1); | |
7289 | + | |
7290 | + init_waitqueue_head(&lowcomms_send_waitq); | |
7291 | + init_waitqueue_entry(&lowcomms_send_waitq_head, current); | |
7292 | + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); | |
7293 | + | |
7294 | + complete(&thread_completion); | |
7295 | + | |
7296 | + while (atomic_read(&send_run)) { | |
7297 | + | |
7298 | + set_task_state(current, TASK_INTERRUPTIBLE); | |
7299 | + | |
7300 | + if (write_and_state_lists_empty()) | |
7301 | + schedule(); | |
7302 | + | |
7303 | + set_task_state(current, TASK_RUNNING); | |
7304 | + | |
7305 | + process_state_queue(); | |
7306 | + process_output_queue(); | |
7307 | + } | |
7308 | + | |
7309 | + down(&thread_lock); | |
7310 | + up(&thread_lock); | |
7311 | + | |
7312 | + complete(&thread_completion); | |
7313 | + | |
7314 | + return 0; | |
7315 | +} | |
7316 | + | |
7317 | +static void daemons_stop(void) | |
7318 | +{ | |
7319 | + if (atomic_read(&recv_run)) { | |
7320 | + down(&thread_lock); | |
7321 | + atomic_set(&recv_run, 0); | |
7322 | + wake_up_interruptible(&lowcomms_recv_waitq); | |
7323 | + up(&thread_lock); | |
7324 | + wait_for_completion(&thread_completion); | |
7325 | + } | |
7326 | + | |
7327 | + if (atomic_read(&send_run)) { | |
7328 | + down(&thread_lock); | |
7329 | + atomic_set(&send_run, 0); | |
7330 | + wake_up_interruptible(&lowcomms_send_waitq); | |
7331 | + up(&thread_lock); | |
7332 | + wait_for_completion(&thread_completion); | |
7333 | + } | |
7334 | +} | |
7335 | + | |
7336 | +static int daemons_start(void) | |
7337 | +{ | |
7338 | + int error; | |
7339 | + | |
7340 | + error = kernel_thread(dlm_recvd, NULL, 0); | |
7341 | + if (error < 0) { | |
7342 | + log_print("can't start recvd thread: %d", error); | |
7343 | + goto out; | |
7344 | + } | |
7345 | + wait_for_completion(&thread_completion); | |
7346 | + | |
7347 | + error = kernel_thread(dlm_sendd, NULL, 0); | |
7348 | + if (error < 0) { | |
7349 | + log_print("can't start sendd thread: %d", error); | |
7350 | + daemons_stop(); | |
7351 | + goto out; | |
7352 | + } | |
7353 | + wait_for_completion(&thread_completion); | |
7354 | + | |
7355 | + error = 0; | |
7356 | + out: | |
7357 | + return error; | |
7358 | +} | |
7359 | + | |
7360 | +/* | |
7361 | + * Return the largest buffer size we can cope with. | |
7362 | + */ | |
7363 | +int lowcomms_max_buffer_size(void) | |
7364 | +{ | |
7365 | + return PAGE_CACHE_SIZE; | |
7366 | +} | |
7367 | + | |
7368 | +void lowcomms_stop(void) | |
7369 | +{ | |
7370 | + int i; | |
7371 | + struct connection *temp; | |
7372 | + struct connection *lcon; | |
7373 | + | |
7374 | + atomic_set(&accepting, 0); | |
7375 | + | |
7376 | + /* Set all the activity flags to prevent any | |
7377 | + socket activity. | |
7378 | + */ | |
7379 | + for (i = 0; i < conn_array_size; i++) { | |
7380 | + connections[i].flags = 0x7; | |
7381 | + } | |
7382 | + daemons_stop(); | |
7383 | + clean_writequeues(); | |
7384 | + | |
7385 | + for (i = 0; i < conn_array_size; i++) { | |
7386 | + close_connection(nodeid2con(i)); | |
7387 | + } | |
7388 | + | |
7389 | + kfree(connections); | |
7390 | + connections = NULL; | |
7391 | + | |
7392 | + /* Free up any dynamically allocated listening sockets */ | |
7393 | + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) { | |
7394 | + sock_release(lcon->sock); | |
7395 | + kfree(lcon); | |
7396 | + } | |
7397 | + | |
7398 | + kcl_releaseref_cluster(); | |
7399 | +} | |
7400 | + | |
7401 | +/* This is quite likely to sleep... */ | |
7402 | +int lowcomms_start(void) | |
7403 | +{ | |
7404 | + int error = 0; | |
7405 | + int i; | |
7406 | + | |
7407 | + INIT_LIST_HEAD(&read_sockets); | |
7408 | + INIT_LIST_HEAD(&write_sockets); | |
7409 | + INIT_LIST_HEAD(&state_sockets); | |
7410 | + INIT_LIST_HEAD(&listen_sockets); | |
7411 | + | |
7412 | + spin_lock_init(&read_sockets_lock); | |
7413 | + spin_lock_init(&write_sockets_lock); | |
7414 | + spin_lock_init(&state_sockets_lock); | |
7415 | + | |
7416 | + init_completion(&thread_completion); | |
7417 | + init_MUTEX(&thread_lock); | |
7418 | + atomic_set(&send_run, 0); | |
7419 | + atomic_set(&recv_run, 0); | |
7420 | + | |
7421 | + error = -ENOTCONN; | |
7422 | + if (kcl_addref_cluster()) | |
7423 | + goto out; | |
7424 | + | |
7425 | + /* | |
7426 | + * Temporarily initialise the waitq head so that lowcomms_send_message | |
7427 | + * doesn't crash if it gets called before the thread is fully | |
7428 | + * initialised | |
7429 | + */ | |
7430 | + init_waitqueue_head(&lowcomms_send_waitq); | |
7431 | + | |
7432 | + error = -ENOMEM; | |
7433 | + | |
7434 | + connections = kmalloc(sizeof(struct connection) * | |
7435 | + dlm_config.max_connections, GFP_KERNEL); | |
7436 | + if (!connections) | |
7437 | + goto out; | |
7438 | + | |
7439 | + memset(connections, 0, | |
7440 | + sizeof(struct connection) * dlm_config.max_connections); | |
7441 | + for (i = 0; i < dlm_config.max_connections; i++) { | |
7442 | + connections[i].nodeid = i; | |
7443 | + init_rwsem(&connections[i].sock_sem); | |
7444 | + INIT_LIST_HEAD(&connections[i].writequeue); | |
7445 | + spin_lock_init(&connections[i].writequeue_lock); | |
7446 | + } | |
7447 | + conn_array_size = dlm_config.max_connections; | |
7448 | + | |
7449 | + /* Start listening */ | |
7450 | + error = listen_for_all(); | |
7451 | + if (error) | |
7452 | + goto fail_free_conn; | |
7453 | + | |
7454 | + error = daemons_start(); | |
7455 | + if (error) | |
7456 | + goto fail_free_conn; | |
7457 | + | |
7458 | + atomic_set(&accepting, 1); | |
7459 | + | |
7460 | + return 0; | |
7461 | + | |
7462 | + fail_free_conn: | |
7463 | + kfree(connections); | |
7464 | + | |
7465 | + out: | |
7466 | + return error; | |
7467 | +} | |
7468 | + | |
7469 | +/* Don't accept any more outgoing work */ | |
7470 | +void lowcomms_stop_accept() | |
7471 | +{ | |
7472 | + atomic_set(&accepting, 0); | |
7473 | +} | |
7474 | + | |
7475 | +/* Cluster Manager interface functions for looking up | |
7476 | + nodeids and IP addresses by each other | |
7477 | +*/ | |
7478 | + | |
7479 | +/* Return the IP address of a node given its NODEID */ | |
7480 | +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr) | |
7481 | +{ | |
7482 | + struct list_head *addrs; | |
7483 | + struct cluster_node_addr *node_addr; | |
7484 | + struct cluster_node_addr *current_addr = NULL; | |
7485 | + struct sockaddr_in6 *saddr; | |
7486 | + int interface; | |
7487 | + int i; | |
7488 | + | |
7489 | + addrs = kcl_get_node_addresses(nodeid); | |
7490 | + if (!addrs) | |
7491 | + return -1; | |
7492 | + | |
7493 | + interface = kcl_get_current_interface(); | |
7494 | + | |
7495 | + /* Look for address number <interface> */ | |
7496 | + i=0; /* i/f numbers start at 1 */ | |
7497 | + list_for_each_entry(node_addr, addrs, list) { | |
7498 | + if (interface == ++i) { | |
7499 | + current_addr = node_addr; | |
7500 | + break; | |
7501 | + } | |
7502 | + } | |
7503 | + | |
7504 | + /* If that failed then just use the first one */ | |
7505 | + if (!current_addr) | |
7506 | + current_addr = (struct cluster_node_addr *)addrs->next; | |
7507 | + | |
7508 | + saddr = (struct sockaddr_in6 *)current_addr->addr; | |
7509 | + | |
7510 | + /* Extract the IP address */ | |
7511 | + if (saddr->sin6_family == AF_INET) { | |
7512 | + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr; | |
7513 | + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr; | |
7514 | + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; | |
7515 | + } | |
7516 | + else { | |
7517 | + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr; | |
7518 | + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr)); | |
7519 | + } | |
7520 | + | |
7521 | + return 0; | |
7522 | +} | |
7523 | + | |
7524 | +/* Return the NODEID for a node given its sockaddr */ | |
7525 | +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len) | |
7526 | +{ | |
7527 | + struct kcl_cluster_node node; | |
7528 | + struct sockaddr_in6 ipv6_addr; | |
7529 | + struct sockaddr_in ipv4_addr; | |
7530 | + | |
7531 | + if (addr->sa_family == AF_INET) { | |
7532 | + struct sockaddr_in *in4 = (struct sockaddr_in *)addr; | |
7533 | + memcpy(&ipv4_addr, &local_addr, addr_len); | |
7534 | + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr)); | |
7535 | + | |
7536 | + addr = (struct sockaddr *)&ipv4_addr; | |
7537 | + } | |
7538 | + else { | |
7539 | + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; | |
7540 | + memcpy(&ipv6_addr, &local_addr, addr_len); | |
7541 | + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr)); | |
7542 | + | |
7543 | + addr = (struct sockaddr *)&ipv6_addr; | |
7544 | + } | |
7545 | + | |
7546 | + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0) | |
7547 | + return node.node_id; | |
7548 | + else | |
7549 | + return 0; | |
7550 | +} | |
7551 | + | |
7552 | +int lowcomms_our_nodeid(void) | |
7553 | +{ | |
7554 | + struct kcl_cluster_node node; | |
7555 | + struct list_head *addrs; | |
7556 | + struct cluster_node_addr *first_addr; | |
7557 | + static int our_nodeid = 0; | |
7558 | + | |
7559 | + if (our_nodeid) | |
7560 | + return our_nodeid; | |
7561 | + | |
7562 | + if (kcl_get_node_by_nodeid(0, &node) == -1) | |
7563 | + return 0; | |
7564 | + | |
7565 | + our_nodeid = node.node_id; | |
7566 | + | |
7567 | + /* Fill in the "template" structure */ | |
7568 | + addrs = kcl_get_node_addresses(our_nodeid); | |
7569 | + if (!addrs) | |
7570 | + return 0; | |
7571 | + | |
7572 | + first_addr = (struct cluster_node_addr *) addrs->next; | |
7573 | + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len); | |
7574 | + | |
7575 | + return node.node_id; | |
7576 | +} | |
7577 | +/* | |
7578 | + * Overrides for Emacs so that we follow Linus's tabbing style. | |
7579 | + * Emacs will notice this stuff at the end of the file and automatically | |
7580 | + * adjust the settings for this buffer only. This must remain at the end | |
7581 | + * of the file. | |
7582 | + * --------------------------------------------------------------------------- | |
7583 | + * Local variables: | |
7584 | + * c-file-style: "linux" | |
7585 | + * End: | |
7586 | + */ | |
7587 | diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h | |
7588 | --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 7589 | +++ linux-patched/cluster/dlm/lowcomms.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 7590 | @@ -0,0 +1,34 @@ |
7591 | +/****************************************************************************** | |
7592 | +******************************************************************************* | |
7593 | +** | |
7594 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
7595 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
7596 | +** | |
7597 | +** This copyrighted material is made available to anyone wishing to use, | |
7598 | +** modify, copy, or redistribute it subject to the terms and conditions | |
7599 | +** of the GNU General Public License v.2. | |
7600 | +** | |
7601 | +******************************************************************************* | |
7602 | +******************************************************************************/ | |
7603 | + | |
7604 | +#ifndef __LOWCOMMS_DOT_H__ | |
7605 | +#define __LOWCOMMS_DOT_H__ | |
7606 | + | |
7607 | +/* The old interface */ | |
7608 | +int lowcomms_send_message(int csid, char *buf, int len, int allocation); | |
7609 | + | |
7610 | +/* The new interface */ | |
7611 | +struct writequeue_entry; | |
7612 | +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len, | |
7613 | + int allocation, char **ppc); | |
7614 | +extern void lowcomms_commit_buffer(struct writequeue_entry *e); | |
7615 | + | |
7616 | +int lowcomms_start(void); | |
7617 | +void lowcomms_stop(void); | |
7618 | +void lowcomms_stop_accept(void); | |
7619 | +int lowcomms_close(int nodeid); | |
7620 | +int lowcomms_max_buffer_size(void); | |
7621 | + | |
7622 | +int lowcomms_our_nodeid(void); | |
7623 | + | |
7624 | +#endif /* __LOWCOMMS_DOT_H__ */ | |
7625 | diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c | |
7626 | --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 7627 | +++ linux-patched/cluster/dlm/main.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 7628 | @@ -0,0 +1,98 @@ |
7629 | +/****************************************************************************** | |
7630 | +******************************************************************************* | |
7631 | +** | |
7632 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
7633 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
7634 | +** | |
7635 | +** This copyrighted material is made available to anyone wishing to use, | |
7636 | +** modify, copy, or redistribute it subject to the terms and conditions | |
7637 | +** of the GNU General Public License v.2. | |
7638 | +** | |
7639 | +******************************************************************************* | |
7640 | +******************************************************************************/ | |
7641 | + | |
7642 | +#define EXPORT_SYMTAB | |
7643 | + | |
7644 | +#include <linux/init.h> | |
7645 | +#include <linux/proc_fs.h> | |
7646 | +#include <linux/ctype.h> | |
7647 | +#include <linux/seq_file.h> | |
7648 | +#include <linux/module.h> | |
7649 | +#include <net/sock.h> | |
7650 | + | |
7651 | +#include <cluster/cnxman.h> | |
7652 | + | |
7653 | +#include "dlm_internal.h" | |
7654 | +#include "lockspace.h" | |
7655 | +#include "recoverd.h" | |
7656 | +#include "ast.h" | |
7657 | +#include "lkb.h" | |
7658 | +#include "nodes.h" | |
7659 | +#include "locking.h" | |
7660 | +#include "config.h" | |
7661 | +#include "memory.h" | |
7662 | +#include "recover.h" | |
7663 | +#include "lowcomms.h" | |
7664 | + | |
7665 | +int dlm_device_init(void); | |
7666 | +void dlm_device_exit(void); | |
7667 | +void dlm_proc_init(void); | |
7668 | +void dlm_proc_exit(void); | |
7669 | + | |
7670 | + | |
7671 | +/* Cluster manager callbacks, we want to know if a node dies | |
7672 | + N.B. this is independent of lockspace-specific event callbacks from SM */ | |
7673 | + | |
7674 | +static void cman_callback(kcl_callback_reason reason, long arg) | |
7675 | +{ | |
7676 | + if (reason == DIED) { | |
7677 | + lowcomms_close((int) arg); | |
7678 | + } | |
7679 | + | |
7680 | + /* This is unconditional. so do what we can to tidy up */ | |
7681 | + if (reason == LEAVING) { | |
7682 | + dlm_emergency_shutdown(); | |
7683 | + } | |
7684 | +} | |
7685 | + | |
7686 | +int __init init_dlm(void) | |
7687 | +{ | |
7688 | + dlm_proc_init(); | |
7689 | + dlm_lockspace_init(); | |
7690 | + dlm_recoverd_init(); | |
7691 | + dlm_nodes_init(); | |
7692 | + dlm_device_init(); | |
7693 | + dlm_memory_init(); | |
7694 | + dlm_config_init(); | |
7695 | + | |
7696 | + kcl_add_callback(cman_callback); | |
7697 | + | |
7698 | + printk("DLM %s (built %s %s) installed\n", | |
7699 | + DLM_RELEASE_NAME, __DATE__, __TIME__); | |
7700 | + | |
7701 | + return 0; | |
7702 | +} | |
7703 | + | |
7704 | +void __exit exit_dlm(void) | |
7705 | +{ | |
7706 | + kcl_remove_callback(cman_callback); | |
7707 | + | |
7708 | + dlm_device_exit(); | |
7709 | + dlm_memory_exit(); | |
7710 | + dlm_config_exit(); | |
7711 | + dlm_proc_exit(); | |
7712 | +} | |
7713 | + | |
7714 | +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME); | |
7715 | +MODULE_AUTHOR("Red Hat, Inc."); | |
7716 | +MODULE_LICENSE("GPL"); | |
7717 | + | |
7718 | +module_init(init_dlm); | |
7719 | +module_exit(exit_dlm); | |
7720 | + | |
7721 | +EXPORT_SYMBOL(dlm_init); | |
7722 | +EXPORT_SYMBOL(dlm_release); | |
7723 | +EXPORT_SYMBOL(dlm_new_lockspace); | |
7724 | +EXPORT_SYMBOL(dlm_release_lockspace); | |
7725 | +EXPORT_SYMBOL(dlm_lock); | |
7726 | +EXPORT_SYMBOL(dlm_unlock); | |
7727 | diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c | |
7728 | --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 7729 | +++ linux-patched/cluster/dlm/memory.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 7730 | @@ -0,0 +1,238 @@ |
7731 | +/****************************************************************************** | |
7732 | +******************************************************************************* | |
7733 | +** | |
7734 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
7735 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
7736 | +** | |
7737 | +** This copyrighted material is made available to anyone wishing to use, | |
7738 | +** modify, copy, or redistribute it subject to the terms and conditions | |
7739 | +** of the GNU General Public License v.2. | |
7740 | +** | |
7741 | +******************************************************************************* | |
7742 | +******************************************************************************/ | |
7743 | + | |
7744 | +/* memory.c | |
7745 | + * | |
7746 | + * memory allocation routines | |
7747 | + * | |
7748 | + */ | |
7749 | + | |
7750 | +#include "dlm_internal.h" | |
7751 | +#include "memory.h" | |
7752 | +#include "config.h" | |
7753 | + | |
7754 | +/* as the man says...Shouldn't this be in a header file somewhere? */ | |
7755 | +#define BYTES_PER_WORD sizeof(void *) | |
7756 | + | |
7757 | +static kmem_cache_t *rsb_cache_small; | |
7758 | +static kmem_cache_t *rsb_cache_large; | |
7759 | +static kmem_cache_t *lkb_cache; | |
7760 | +static kmem_cache_t *lvb_cache; | |
7761 | +static kmem_cache_t *resdir_cache_large; | |
7762 | +static kmem_cache_t *resdir_cache_small; | |
7763 | + | |
7764 | +/* The thresholds above which we allocate large RSBs/resdatas rather than small | |
7765 | + * ones. This must make the resultant structure end on a word boundary */ | |
7766 | +#define LARGE_RSB_NAME 28 | |
7767 | +#define LARGE_RES_NAME 28 | |
7768 | + | |
7769 | +int dlm_memory_init() | |
7770 | +{ | |
7771 | + int ret = -ENOMEM; | |
7772 | + | |
7773 | + | |
7774 | + rsb_cache_small = | |
7775 | + kmem_cache_create("dlm_rsb(small)", | |
7776 | + (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), | |
7777 | + __alignof__(gd_res_t), 0, NULL, NULL); | |
7778 | + if (!rsb_cache_small) | |
7779 | + goto out; | |
7780 | + | |
7781 | + rsb_cache_large = | |
7782 | + kmem_cache_create("dlm_rsb(large)", | |
7783 | + sizeof(gd_res_t) + DLM_RESNAME_MAXLEN, | |
7784 | + __alignof__(gd_res_t), 0, NULL, NULL); | |
7785 | + if (!rsb_cache_large) | |
7786 | + goto out_free_rsbs; | |
7787 | + | |
7788 | + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t), | |
7789 | + __alignof__(gd_lkb_t), 0, NULL, NULL); | |
7790 | + if (!lkb_cache) | |
7791 | + goto out_free_rsbl; | |
7792 | + | |
7793 | + resdir_cache_large = | |
7794 | + kmem_cache_create("dlm_resdir(l)", | |
7795 | + sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN, | |
7796 | + __alignof__(gd_resdata_t), 0, NULL, NULL); | |
7797 | + if (!resdir_cache_large) | |
7798 | + goto out_free_lkb; | |
7799 | + | |
7800 | + resdir_cache_small = | |
7801 | + kmem_cache_create("dlm_resdir(s)", | |
7802 | + (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), | |
7803 | + __alignof__(gd_resdata_t), 0, NULL, NULL); | |
7804 | + if (!resdir_cache_small) | |
7805 | + goto out_free_resl; | |
7806 | + | |
7807 | + /* LVB cache also holds ranges, so should be 64bit aligned */ | |
7808 | + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN, | |
7809 | + __alignof__(uint64_t), 0, NULL, NULL); | |
7810 | + if (!lkb_cache) | |
7811 | + goto out_free_ress; | |
7812 | + | |
7813 | + ret = 0; | |
7814 | + goto out; | |
7815 | + | |
7816 | + out_free_ress: | |
7817 | + kmem_cache_destroy(resdir_cache_small); | |
7818 | + | |
7819 | + out_free_resl: | |
7820 | + kmem_cache_destroy(resdir_cache_large); | |
7821 | + | |
7822 | + out_free_lkb: | |
7823 | + kmem_cache_destroy(lkb_cache); | |
7824 | + | |
7825 | + out_free_rsbl: | |
7826 | + kmem_cache_destroy(rsb_cache_large); | |
7827 | + | |
7828 | + out_free_rsbs: | |
7829 | + kmem_cache_destroy(rsb_cache_small); | |
7830 | + | |
7831 | + out: | |
7832 | + return ret; | |
7833 | +} | |
7834 | + | |
7835 | +void dlm_memory_exit() | |
7836 | +{ | |
7837 | + kmem_cache_destroy(rsb_cache_large); | |
7838 | + kmem_cache_destroy(rsb_cache_small); | |
7839 | + kmem_cache_destroy(lkb_cache); | |
7840 | + kmem_cache_destroy(resdir_cache_small); | |
7841 | + kmem_cache_destroy(resdir_cache_large); | |
7842 | + kmem_cache_destroy(lvb_cache); | |
7843 | +} | |
7844 | + | |
7845 | +gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen) | |
7846 | +{ | |
7847 | + gd_res_t *r; | |
7848 | + | |
7849 | + GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); | |
7850 | + | |
7851 | + if (namelen >= LARGE_RSB_NAME) | |
7852 | + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation); | |
7853 | + else | |
7854 | + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation); | |
7855 | + | |
7856 | + if (r) | |
7857 | + memset(r, 0, sizeof(gd_res_t) + namelen); | |
7858 | + | |
7859 | + return r; | |
7860 | +} | |
7861 | + | |
7862 | +void free_rsb(gd_res_t *r) | |
7863 | +{ | |
7864 | + int length = r->res_length; | |
7865 | + | |
7866 | +#ifdef POISON | |
7867 | + memset(r, 0x55, sizeof(gd_res_t) + r->res_length); | |
7868 | +#endif | |
7869 | + | |
7870 | + if (length >= LARGE_RSB_NAME) | |
7871 | + kmem_cache_free(rsb_cache_large, r); | |
7872 | + else | |
7873 | + kmem_cache_free(rsb_cache_small, r); | |
7874 | +} | |
7875 | + | |
7876 | +gd_lkb_t *allocate_lkb(gd_ls_t *ls) | |
7877 | +{ | |
7878 | + gd_lkb_t *l; | |
7879 | + | |
7880 | + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation); | |
7881 | + if (l) | |
7882 | + memset(l, 0, sizeof(gd_lkb_t)); | |
7883 | + | |
7884 | + return l; | |
7885 | +} | |
7886 | + | |
7887 | +void free_lkb(gd_lkb_t *l) | |
7888 | +{ | |
7889 | +#ifdef POISON | |
7890 | + memset(l, 0xAA, sizeof(gd_lkb_t)); | |
7891 | +#endif | |
7892 | + kmem_cache_free(lkb_cache, l); | |
7893 | +} | |
7894 | + | |
7895 | +gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen) | |
7896 | +{ | |
7897 | + gd_resdata_t *rd; | |
7898 | + | |
7899 | + GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); | |
7900 | + | |
7901 | + if (namelen >= LARGE_RES_NAME) | |
7902 | + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation); | |
7903 | + else | |
7904 | + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation); | |
7905 | + | |
7906 | + if (rd) | |
7907 | + memset(rd, 0, sizeof(gd_resdata_t)); | |
7908 | + | |
7909 | + return rd; | |
7910 | +} | |
7911 | + | |
7912 | +void free_resdata(gd_resdata_t *rd) | |
7913 | +{ | |
7914 | + if (rd->rd_length >= LARGE_RES_NAME) | |
7915 | + kmem_cache_free(resdir_cache_large, rd); | |
7916 | + else | |
7917 | + kmem_cache_free(resdir_cache_small, rd); | |
7918 | +} | |
7919 | + | |
7920 | +char *allocate_lvb(gd_ls_t *ls) | |
7921 | +{ | |
7922 | + char *l; | |
7923 | + | |
7924 | + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation); | |
7925 | + if (l) | |
7926 | + memset(l, 0, DLM_LVB_LEN); | |
7927 | + | |
7928 | + return l; | |
7929 | +} | |
7930 | + | |
7931 | +void free_lvb(char *l) | |
7932 | +{ | |
7933 | + kmem_cache_free(lvb_cache, l); | |
7934 | +} | |
7935 | + | |
7936 | +/* Ranges are allocated from the LVB cache as they are the same size (4x64 | |
7937 | + * bits) */ | |
7938 | +uint64_t *allocate_range(gd_ls_t * ls) | |
7939 | +{ | |
7940 | + uint64_t *l; | |
7941 | + | |
7942 | + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation); | |
7943 | + if (l) | |
7944 | + memset(l, 0, DLM_LVB_LEN); | |
7945 | + | |
7946 | + return l; | |
7947 | +} | |
7948 | + | |
7949 | +void free_range(uint64_t *l) | |
7950 | +{ | |
7951 | + kmem_cache_free(lvb_cache, l); | |
7952 | +} | |
7953 | + | |
7954 | +gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls) | |
7955 | +{ | |
7956 | + gd_rcom_t *rc; | |
7957 | + | |
7958 | + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation); | |
7959 | + if (rc) | |
7960 | + memset(rc, 0, dlm_config.buffer_size); | |
7961 | + | |
7962 | + return rc; | |
7963 | +} | |
7964 | + | |
7965 | +void free_rcom_buffer(gd_rcom_t *rc) | |
7966 | +{ | |
7967 | + kfree(rc); | |
7968 | +} | |
7969 | diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h | |
7970 | --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 7971 | +++ linux-patched/cluster/dlm/memory.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 7972 | @@ -0,0 +1,32 @@ |
7973 | +/****************************************************************************** | |
7974 | +******************************************************************************* | |
7975 | +** | |
7976 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
7977 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
7978 | +** | |
7979 | +** This copyrighted material is made available to anyone wishing to use, | |
7980 | +** modify, copy, or redistribute it subject to the terms and conditions | |
7981 | +** of the GNU General Public License v.2. | |
7982 | +** | |
7983 | +******************************************************************************* | |
7984 | +******************************************************************************/ | |
7985 | + | |
7986 | +#ifndef __MEMORY_DOT_H__ | |
7987 | +#define __MEMORY_DOT_H__ | |
7988 | + | |
7989 | +int dlm_memory_init(void); | |
7990 | +void dlm_memory_exit(void); | |
7991 | +gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen); | |
7992 | +void free_rsb(gd_res_t * r); | |
7993 | +gd_lkb_t *allocate_lkb(gd_ls_t * ls); | |
7994 | +void free_lkb(gd_lkb_t * l); | |
7995 | +gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen); | |
7996 | +void free_resdata(gd_resdata_t * rd); | |
7997 | +char *allocate_lvb(gd_ls_t * ls); | |
7998 | +void free_lvb(char *l); | |
7999 | +gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls); | |
8000 | +void free_rcom_buffer(gd_rcom_t * rc); | |
8001 | +uint64_t *allocate_range(gd_ls_t * ls); | |
8002 | +void free_range(uint64_t * l); | |
8003 | + | |
8004 | +#endif /* __MEMORY_DOT_H__ */ | |
8005 | diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c | |
8006 | --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 8007 | +++ linux-patched/cluster/dlm/midcomms.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 8008 | @@ -0,0 +1,351 @@ |
8009 | +/****************************************************************************** | |
8010 | +******************************************************************************* | |
8011 | +** | |
8012 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
8013 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
8014 | +** | |
8015 | +** This copyrighted material is made available to anyone wishing to use, | |
8016 | +** modify, copy, or redistribute it subject to the terms and conditions | |
8017 | +** of the GNU General Public License v.2. | |
8018 | +** | |
8019 | +******************************************************************************* | |
8020 | +******************************************************************************/ | |
8021 | + | |
8022 | +/* | |
8023 | + * midcomms.c | |
8024 | + * | |
8025 | + * This is the appallingly named "mid-level" comms layer. | |
8026 | + * | |
8027 | + * Its purpose is to take packets from the "real" comms layer, | |
8028 | + * split them up into packets and pass them to the interested | |
8029 | + * part of the locking mechanism. | |
8030 | + * | |
8031 | + * It also takes messages from the locking layer, formats them | |
8032 | + * into packets and sends them to the comms layer. | |
8033 | + * | |
8034 | + * It knows the format of the mid-level messages used and nodeidss | |
8035 | + * but it does not know how to resolve a nodeid into an IP address | |
8036 | + * or any of the comms channel details | |
8037 | + * | |
8038 | + */ | |
8039 | + | |
8040 | +#include "dlm_internal.h" | |
8041 | +#include "lowcomms.h" | |
8042 | +#include "midcomms.h" | |
8043 | +#include "lockqueue.h" | |
8044 | +#include "nodes.h" | |
8045 | +#include "reccomms.h" | |
8046 | +#include "config.h" | |
8047 | + | |
8048 | +/* Byteorder routines */ | |
8049 | + | |
8050 | +static void host_to_network(void *msg) | |
8051 | +{ | |
8052 | + struct gd_req_header *head = msg; | |
8053 | + struct gd_remlockrequest *req = msg; | |
8054 | + struct gd_remlockreply *reply = msg; | |
8055 | + struct gd_remquery *query = msg; | |
8056 | + struct gd_remqueryreply *queryrep = msg; | |
8057 | + gd_rcom_t *rc = msg; | |
8058 | + | |
8059 | + /* Force into network byte order */ | |
8060 | + | |
8061 | + /* | |
8062 | + * Do the common header first | |
8063 | + */ | |
8064 | + | |
8065 | + head->rh_length = cpu_to_le16(head->rh_length); | |
8066 | + head->rh_lockspace = cpu_to_le32(head->rh_lockspace); | |
8067 | + /* Leave the lkid alone as it is transparent at the remote end */ | |
8068 | + | |
8069 | + /* | |
8070 | + * Do the fields in the remlockrequest or remlockreply structs | |
8071 | + */ | |
8072 | + | |
8073 | + switch (req->rr_header.rh_cmd) { | |
8074 | + | |
8075 | + case GDLM_REMCMD_LOCKREQUEST: | |
8076 | + case GDLM_REMCMD_CONVREQUEST: | |
8077 | + req->rr_range_start = cpu_to_le64(req->rr_range_start); | |
8078 | + req->rr_range_end = cpu_to_le64(req->rr_range_end); | |
8079 | + /* Deliberate fall through */ | |
8080 | + case GDLM_REMCMD_UNLOCKREQUEST: | |
8081 | + case GDLM_REMCMD_LOOKUP: | |
8082 | + case GDLM_REMCMD_LOCKGRANT: | |
8083 | + case GDLM_REMCMD_SENDBAST: | |
8084 | + case GDLM_REMCMD_SENDCAST: | |
8085 | + case GDLM_REMCMD_REM_RESDATA: | |
8086 | + req->rr_flags = cpu_to_le32(req->rr_flags); | |
8087 | + req->rr_status = cpu_to_le32(req->rr_status); | |
8088 | + break; | |
8089 | + | |
8090 | + case GDLM_REMCMD_LOCKREPLY: | |
8091 | + reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate); | |
8092 | + reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid); | |
8093 | + reply->rl_status = cpu_to_le32(reply->rl_status); | |
8094 | + break; | |
8095 | + | |
8096 | + case GDLM_REMCMD_RECOVERMESSAGE: | |
8097 | + case GDLM_REMCMD_RECOVERREPLY: | |
8098 | + rc->rc_msgid = cpu_to_le32(rc->rc_msgid); | |
8099 | + rc->rc_datalen = cpu_to_le16(rc->rc_datalen); | |
8100 | + break; | |
8101 | + | |
8102 | + case GDLM_REMCMD_QUERY: | |
8103 | + query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid); | |
8104 | + query->rq_query = cpu_to_le32(query->rq_query); | |
8105 | + query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks); | |
8106 | + break; | |
8107 | + | |
8108 | + case GDLM_REMCMD_QUERYREPLY: | |
8109 | + queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks); | |
8110 | + queryrep->rq_status = cpu_to_le32(queryrep->rq_status); | |
8111 | + queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount); | |
8112 | + queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount); | |
8113 | + queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount); | |
8114 | + break; | |
8115 | + | |
8116 | + default: | |
8117 | + printk("dlm: warning, unknown REMCMD type %u\n", | |
8118 | + req->rr_header.rh_cmd); | |
8119 | + } | |
8120 | +} | |
8121 | + | |
8122 | +static void network_to_host(void *msg) | |
8123 | +{ | |
8124 | + struct gd_req_header *head = msg; | |
8125 | + struct gd_remlockrequest *req = msg; | |
8126 | + struct gd_remlockreply *reply = msg; | |
8127 | + struct gd_remquery *query = msg; | |
8128 | + struct gd_remqueryreply *queryrep = msg; | |
8129 | + gd_rcom_t *rc = msg; | |
8130 | + | |
8131 | + /* Force into host byte order */ | |
8132 | + | |
8133 | + /* | |
8134 | + * Do the common header first | |
8135 | + */ | |
8136 | + | |
8137 | + head->rh_length = le16_to_cpu(head->rh_length); | |
8138 | + head->rh_lockspace = le32_to_cpu(head->rh_lockspace); | |
8139 | + /* Leave the lkid alone as it is transparent at the remote end */ | |
8140 | + | |
8141 | + /* | |
8142 | + * Do the fields in the remlockrequest or remlockreply structs | |
8143 | + */ | |
8144 | + | |
8145 | + switch (req->rr_header.rh_cmd) { | |
8146 | + | |
8147 | + case GDLM_REMCMD_LOCKREQUEST: | |
8148 | + case GDLM_REMCMD_CONVREQUEST: | |
8149 | + req->rr_range_start = le64_to_cpu(req->rr_range_start); | |
8150 | + req->rr_range_end = le64_to_cpu(req->rr_range_end); | |
8151 | + case GDLM_REMCMD_LOOKUP: | |
8152 | + case GDLM_REMCMD_UNLOCKREQUEST: | |
8153 | + case GDLM_REMCMD_LOCKGRANT: | |
8154 | + case GDLM_REMCMD_SENDBAST: | |
8155 | + case GDLM_REMCMD_SENDCAST: | |
8156 | + case GDLM_REMCMD_REM_RESDATA: | |
8157 | + /* Actually, not much to do here as the remote lock IDs are | |
8158 | + * transparent too */ | |
8159 | + req->rr_flags = le32_to_cpu(req->rr_flags); | |
8160 | + req->rr_status = le32_to_cpu(req->rr_status); | |
8161 | + break; | |
8162 | + | |
8163 | + case GDLM_REMCMD_LOCKREPLY: | |
8164 | + reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate); | |
8165 | + reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid); | |
8166 | + reply->rl_status = le32_to_cpu(reply->rl_status); | |
8167 | + break; | |
8168 | + | |
8169 | + case GDLM_REMCMD_RECOVERMESSAGE: | |
8170 | + case GDLM_REMCMD_RECOVERREPLY: | |
8171 | + rc->rc_msgid = le32_to_cpu(rc->rc_msgid); | |
8172 | + rc->rc_datalen = le16_to_cpu(rc->rc_datalen); | |
8173 | + break; | |
8174 | + | |
8175 | + | |
8176 | + case GDLM_REMCMD_QUERY: | |
8177 | + query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid); | |
8178 | + query->rq_query = le32_to_cpu(query->rq_query); | |
8179 | + query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks); | |
8180 | + break; | |
8181 | + | |
8182 | + case GDLM_REMCMD_QUERYREPLY: | |
8183 | + queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks); | |
8184 | + queryrep->rq_status = le32_to_cpu(queryrep->rq_status); | |
8185 | + queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount); | |
8186 | + queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount); | |
8187 | + queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount); | |
8188 | + break; | |
8189 | + | |
8190 | + default: | |
8191 | + printk("dlm: warning, unknown REMCMD type %u\n", | |
8192 | + req->rr_header.rh_cmd); | |
8193 | + } | |
8194 | +} | |
8195 | + | |
8196 | +static void copy_from_cb(void *dst, const void *base, unsigned offset, | |
8197 | + unsigned len, unsigned limit) | |
8198 | +{ | |
8199 | + unsigned copy = len; | |
8200 | + | |
8201 | + if ((copy + offset) > limit) | |
8202 | + copy = limit - offset; | |
8203 | + memcpy(dst, base + offset, copy); | |
8204 | + len -= copy; | |
8205 | + if (len) | |
8206 | + memcpy(dst + copy, base, len); | |
8207 | +} | |
8208 | + | |
8209 | +static void khexdump(const unsigned char *c, int len) | |
8210 | +{ | |
8211 | + while (len > 16) { | |
8212 | + printk(KERN_INFO | |
8213 | + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n", | |
8214 | + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], | |
8215 | + c[9], c[10], c[11], c[12], c[13], c[14], c[15]); | |
8216 | + len -= 16; | |
8217 | + } | |
8218 | + while (len > 4) { | |
8219 | + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2], | |
8220 | + c[3]); | |
8221 | + len -= 4; | |
8222 | + } | |
8223 | + while (len > 0) { | |
8224 | + printk(KERN_INFO "%02x\n", c[0]); | |
8225 | + len--; | |
8226 | + } | |
8227 | +} | |
8228 | + | |
8229 | +/* | |
8230 | + * Called from the low-level comms layer to process a buffer of | |
8231 | + * commands. | |
8232 | + * | |
8233 | + * Only complete messages are processed here, any "spare" bytes from | |
8234 | + * the end of a buffer are saved and tacked onto the front of the next | |
8235 | + * message that comes in. I doubt this will happen very often but we | |
8236 | + * need to be able to cope with it and I don't want the task to be waiting | |
8237 | + * for packets to come in when there is useful work to be done. | |
8238 | + * | |
8239 | + */ | |
8240 | +int midcomms_process_incoming_buffer(int nodeid, const void *base, | |
8241 | + unsigned offset, unsigned len, | |
8242 | + unsigned limit) | |
8243 | +{ | |
8244 | + unsigned char __tmp[sizeof(struct gd_req_header) + 64]; | |
8245 | + struct gd_req_header *msg = (struct gd_req_header *) __tmp; | |
8246 | + int ret = 0; | |
8247 | + int err = 0; | |
8248 | + unsigned msglen; | |
8249 | + __u32 id, space; | |
8250 | + | |
8251 | + while (len > sizeof(struct gd_req_header)) { | |
8252 | + /* Get message header and check it over */ | |
8253 | + copy_from_cb(msg, base, offset, sizeof(struct gd_req_header), | |
8254 | + limit); | |
8255 | + msglen = le16_to_cpu(msg->rh_length); | |
8256 | + id = msg->rh_lkid; | |
8257 | + space = msg->rh_lockspace; | |
8258 | + | |
8259 | + /* Check message size */ | |
8260 | + err = -EINVAL; | |
8261 | + if (msglen < sizeof(struct gd_req_header)) | |
8262 | + break; | |
8263 | + err = -E2BIG; | |
8264 | + if (msglen > dlm_config.buffer_size) { | |
8265 | + printk("dlm: message size too big %d\n", msglen); | |
8266 | + break; | |
8267 | + } | |
8268 | + err = 0; | |
8269 | + | |
8270 | + /* Not enough in buffer yet? wait for some more */ | |
8271 | + if (msglen > len) | |
8272 | + break; | |
8273 | + | |
8274 | + /* Make sure our temp buffer is large enough */ | |
8275 | + if (msglen > sizeof(__tmp) && | |
8276 | + msg == (struct gd_req_header *) __tmp) { | |
8277 | + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); | |
8278 | + if (msg == NULL) | |
8279 | + return ret; | |
8280 | + } | |
8281 | + | |
8282 | + copy_from_cb(msg, base, offset, msglen, limit); | |
8283 | + BUG_ON(id != msg->rh_lkid); | |
8284 | + BUG_ON(space != msg->rh_lockspace); | |
8285 | + ret += msglen; | |
8286 | + offset += msglen; | |
8287 | + offset &= (limit - 1); | |
8288 | + len -= msglen; | |
8289 | + network_to_host(msg); | |
8290 | + | |
8291 | + if ((msg->rh_cmd > 32) || | |
8292 | + (msg->rh_cmd == 0) || | |
8293 | + (msg->rh_length < sizeof(struct gd_req_header)) || | |
8294 | + (msg->rh_length > dlm_config.buffer_size)) { | |
8295 | + | |
8296 | + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, " | |
8297 | + "lkid=%u, lockspace=%u\n", | |
8298 | + msg->rh_cmd, msg->rh_flags, msg->rh_length, | |
8299 | + msg->rh_lkid, msg->rh_lockspace); | |
8300 | + | |
8301 | + printk("dlm: midcomms: base=%p, offset=%u, len=%u, " | |
8302 | + "ret=%u, limit=%08x newbuf=%d\n", | |
8303 | + base, offset, len, ret, limit, | |
8304 | + ((struct gd_req_header *) __tmp == msg)); | |
8305 | + | |
8306 | + khexdump((const unsigned char *) msg, msg->rh_length); | |
8307 | + | |
8308 | + return -EBADMSG; | |
8309 | + } | |
8310 | + | |
8311 | + switch (msg->rh_cmd) { | |
8312 | + case GDLM_REMCMD_RECOVERMESSAGE: | |
8313 | + case GDLM_REMCMD_RECOVERREPLY: | |
8314 | + process_recovery_comm(nodeid, msg); | |
8315 | + break; | |
8316 | + default: | |
8317 | + process_cluster_request(nodeid, msg, FALSE); | |
8318 | + } | |
8319 | + } | |
8320 | + | |
8321 | + if (msg != (struct gd_req_header *) __tmp) | |
8322 | + kfree(msg); | |
8323 | + | |
8324 | + return err ? err : ret; | |
8325 | +} | |
8326 | + | |
8327 | +/* | |
8328 | + * Send a lowcomms buffer | |
8329 | + */ | |
8330 | + | |
8331 | +void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e) | |
8332 | +{ | |
8333 | + host_to_network(msg); | |
8334 | + lowcomms_commit_buffer(e); | |
8335 | +} | |
8336 | + | |
8337 | +/* | |
8338 | + * Make the message into network byte order and send it | |
8339 | + */ | |
8340 | + | |
8341 | +int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg, | |
8342 | + int allocation) | |
8343 | +{ | |
8344 | + int len = msg->rh_length; | |
8345 | + | |
8346 | + host_to_network(msg); | |
8347 | + | |
8348 | + /* | |
8349 | + * Loopback. In fact, the locking code pretty much prevents this from | |
8350 | + * being needed but it can happen when the directory node is also the | |
8351 | + * local node. | |
8352 | + */ | |
8353 | + | |
8354 | + if (nodeid == our_nodeid()) | |
8355 | + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0, | |
8356 | + len, len); | |
8357 | + | |
8358 | + return lowcomms_send_message(nodeid, (char *) msg, len, allocation); | |
8359 | +} | |
8360 | diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h | |
8361 | --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 8362 | +++ linux-patched/cluster/dlm/midcomms.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 8363 | @@ -0,0 +1,24 @@ |
8364 | +/****************************************************************************** | |
8365 | +******************************************************************************* | |
8366 | +** | |
8367 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
8368 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
8369 | +** | |
8370 | +** This copyrighted material is made available to anyone wishing to use, | |
8371 | +** modify, copy, or redistribute it subject to the terms and conditions | |
8372 | +** of the GNU General Public License v.2. | |
8373 | +** | |
8374 | +******************************************************************************* | |
8375 | +******************************************************************************/ | |
8376 | + | |
8377 | +#ifndef __MIDCOMMS_DOT_H__ | |
8378 | +#define __MIDCOMMS_DOT_H__ | |
8379 | + | |
8380 | +int midcomms_send_message(uint32_t csid, struct gd_req_header *msg, | |
8381 | + int allocation); | |
8382 | +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset, | |
8383 | + unsigned len, unsigned limit); | |
8384 | +void midcomms_send_buffer(struct gd_req_header *msg, | |
8385 | + struct writequeue_entry *e); | |
8386 | + | |
8387 | +#endif /* __MIDCOMMS_DOT_H__ */ | |
8388 | diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c | |
8389 | --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 8390 | +++ linux-patched/cluster/dlm/nodes.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 8391 | @@ -0,0 +1,325 @@ |
8392 | +/****************************************************************************** | |
8393 | +******************************************************************************* | |
8394 | +** | |
8395 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
8396 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
8397 | +** | |
8398 | +** This copyrighted material is made available to anyone wishing to use, | |
8399 | +** modify, copy, or redistribute it subject to the terms and conditions | |
8400 | +** of the GNU General Public License v.2. | |
8401 | +** | |
8402 | +******************************************************************************* | |
8403 | +******************************************************************************/ | |
8404 | + | |
8405 | +#include <net/sock.h> | |
8406 | +#include <cluster/cnxman.h> | |
8407 | + | |
8408 | +#include "dlm_internal.h" | |
8409 | +#include "lowcomms.h" | |
8410 | +#include "nodes.h" | |
8411 | +#include "recover.h" | |
8412 | +#include "reccomms.h" | |
8413 | +#include "util.h" | |
8414 | + | |
8415 | +static struct list_head cluster_nodes; | |
8416 | +static spinlock_t node_lock; | |
8417 | +static uint32_t local_nodeid; | |
8418 | +static struct semaphore local_init_lock; | |
8419 | + | |
8420 | + | |
8421 | +void dlm_nodes_init(void) | |
8422 | +{ | |
8423 | + INIT_LIST_HEAD(&cluster_nodes); | |
8424 | + spin_lock_init(&node_lock); | |
8425 | + local_nodeid = 0; | |
8426 | + init_MUTEX(&local_init_lock); | |
8427 | +} | |
8428 | + | |
8429 | +static gd_node_t *search_node(uint32_t nodeid) | |
8430 | +{ | |
8431 | + gd_node_t *node; | |
8432 | + | |
8433 | + list_for_each_entry(node, &cluster_nodes, gn_list) { | |
8434 | + if (node->gn_nodeid == nodeid) | |
8435 | + goto out; | |
8436 | + } | |
8437 | + node = NULL; | |
8438 | + out: | |
8439 | + return node; | |
8440 | +} | |
8441 | + | |
8442 | +static void put_node(gd_node_t *node) | |
8443 | +{ | |
8444 | + spin_lock(&node_lock); | |
8445 | + node->gn_refcount--; | |
8446 | + if (node->gn_refcount == 0) { | |
8447 | + list_del(&node->gn_list); | |
8448 | + spin_unlock(&node_lock); | |
8449 | + kfree(node); | |
8450 | + return; | |
8451 | + } | |
8452 | + spin_unlock(&node_lock); | |
8453 | +} | |
8454 | + | |
8455 | +static int get_node(uint32_t nodeid, gd_node_t **ndp) | |
8456 | +{ | |
8457 | + gd_node_t *node, *node2; | |
8458 | + int error = -ENOMEM; | |
8459 | + | |
8460 | + spin_lock(&node_lock); | |
8461 | + node = search_node(nodeid); | |
8462 | + if (node) | |
8463 | + node->gn_refcount++; | |
8464 | + spin_unlock(&node_lock); | |
8465 | + | |
8466 | + if (node) | |
8467 | + goto out; | |
8468 | + | |
8469 | + node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL); | |
8470 | + if (!node) | |
8471 | + goto fail; | |
8472 | + | |
8473 | + memset(node, 0, sizeof(gd_node_t)); | |
8474 | + node->gn_nodeid = nodeid; | |
8475 | + | |
8476 | + spin_lock(&node_lock); | |
8477 | + node2 = search_node(nodeid); | |
8478 | + if (node2) { | |
8479 | + node2->gn_refcount++; | |
8480 | + spin_unlock(&node_lock); | |
8481 | + kfree(node); | |
8482 | + node = node2; | |
8483 | + goto out; | |
8484 | + } | |
8485 | + | |
8486 | + node->gn_refcount = 1; | |
8487 | + list_add_tail(&node->gn_list, &cluster_nodes); | |
8488 | + spin_unlock(&node_lock); | |
8489 | + | |
8490 | + out: | |
8491 | + *ndp = node; | |
8492 | + return 0; | |
8493 | + | |
8494 | + fail: | |
8495 | + return error; | |
8496 | +} | |
8497 | + | |
8498 | +int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb) | |
8499 | +{ | |
8500 | + gd_csb_t *csb; | |
8501 | + gd_node_t *node; | |
8502 | + int error = -ENOMEM; | |
8503 | + | |
8504 | + csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL); | |
8505 | + if (!csb) | |
8506 | + goto fail; | |
8507 | + | |
8508 | + memset(csb, 0, sizeof(gd_csb_t)); | |
8509 | + | |
8510 | + error = get_node(nodeid, &node); | |
8511 | + if (error) | |
8512 | + goto fail_free; | |
8513 | + | |
8514 | + csb->csb_node = node; | |
8515 | + | |
8516 | + down(&local_init_lock); | |
8517 | + | |
8518 | + if (!local_nodeid) { | |
8519 | + if (nodeid == our_nodeid()) { | |
8520 | + local_nodeid = node->gn_nodeid; | |
8521 | + } | |
8522 | + } | |
8523 | + up(&local_init_lock); | |
8524 | + | |
8525 | + *ret_csb = csb; | |
8526 | + return 0; | |
8527 | + | |
8528 | + fail_free: | |
8529 | + kfree(csb); | |
8530 | + fail: | |
8531 | + return error; | |
8532 | +} | |
8533 | + | |
8534 | +void release_csb(gd_csb_t *csb) | |
8535 | +{ | |
8536 | + put_node(csb->csb_node); | |
8537 | + kfree(csb); | |
8538 | +} | |
8539 | + | |
8540 | +uint32_t our_nodeid(void) | |
8541 | +{ | |
8542 | + return lowcomms_our_nodeid(); | |
8543 | +} | |
8544 | + | |
8545 | +int nodes_reconfig_wait(gd_ls_t *ls) | |
8546 | +{ | |
8547 | + int error; | |
8548 | + | |
8549 | + if (ls->ls_low_nodeid == our_nodeid()) { | |
8550 | + error = gdlm_wait_status_all(ls, NODES_VALID); | |
8551 | + if (!error) | |
8552 | + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags); | |
8553 | + | |
8554 | + /* Experimental: this delay should allow any final messages | |
8555 | + * from the previous node to be received before beginning | |
8556 | + * recovery. */ | |
8557 | + | |
8558 | + if (ls->ls_num_nodes == 1) { | |
8559 | + current->state = TASK_UNINTERRUPTIBLE; | |
8560 | + schedule_timeout((2) * HZ); | |
8561 | + } | |
8562 | + | |
8563 | + } else | |
8564 | + error = gdlm_wait_status_low(ls, NODES_ALL_VALID); | |
8565 | + | |
8566 | + return error; | |
8567 | +} | |
8568 | + | |
8569 | +static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new) | |
8570 | +{ | |
8571 | + gd_csb_t *csb = NULL; | |
8572 | + struct list_head *tmp; | |
8573 | + struct list_head *newlist = &new->csb_list; | |
8574 | + struct list_head *head = &ls->ls_nodes; | |
8575 | + | |
8576 | + list_for_each(tmp, head) { | |
8577 | + csb = list_entry(tmp, gd_csb_t, csb_list); | |
8578 | + | |
8579 | + if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid) | |
8580 | + break; | |
8581 | + } | |
8582 | + | |
8583 | + if (!csb) | |
8584 | + list_add_tail(newlist, head); | |
8585 | + else { | |
8586 | + /* FIXME: can use list macro here */ | |
8587 | + newlist->prev = tmp->prev; | |
8588 | + newlist->next = tmp; | |
8589 | + tmp->prev->next = newlist; | |
8590 | + tmp->prev = newlist; | |
8591 | + } | |
8592 | +} | |
8593 | + | |
8594 | +int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out) | |
8595 | +{ | |
8596 | + gd_csb_t *csb, *safe; | |
8597 | + int error, i, found, pos = 0, neg = 0; | |
8598 | + uint32_t low = (uint32_t) (-1); | |
8599 | + | |
8600 | + /* | |
8601 | + * Remove (and save) departed nodes from lockspace's nodes list | |
8602 | + */ | |
8603 | + | |
8604 | + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) { | |
8605 | + found = FALSE; | |
8606 | + for (i = 0; i < gr->gr_node_count; i++) { | |
8607 | + if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) { | |
8608 | + found = TRUE; | |
8609 | + break; | |
8610 | + } | |
8611 | + } | |
8612 | + | |
8613 | + if (!found) { | |
8614 | + neg++; | |
8615 | + csb->csb_gone_event = gr->gr_event_id; | |
8616 | + list_del(&csb->csb_list); | |
8617 | + list_add_tail(&csb->csb_list, &ls->ls_nodes_gone); | |
8618 | + ls->ls_num_nodes--; | |
8619 | + log_all(ls, "remove node %u", csb->csb_node->gn_nodeid); | |
8620 | + } | |
8621 | + } | |
8622 | + | |
8623 | + /* | |
8624 | + * Add new nodes to lockspace's nodes list | |
8625 | + */ | |
8626 | + | |
8627 | + for (i = 0; i < gr->gr_node_count; i++) { | |
8628 | + found = FALSE; | |
8629 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
8630 | + if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) { | |
8631 | + found = TRUE; | |
8632 | + break; | |
8633 | + } | |
8634 | + } | |
8635 | + | |
8636 | + if (!found) { | |
8637 | + pos++; | |
8638 | + | |
8639 | + error = init_new_csb(gr->gr_nodeids[i], &csb); | |
8640 | + GDLM_ASSERT(!error,); | |
8641 | + | |
8642 | + add_ordered_node(ls, csb); | |
8643 | + ls->ls_num_nodes++; | |
8644 | + log_all(ls, "add node %u", csb->csb_node->gn_nodeid); | |
8645 | + } | |
8646 | + } | |
8647 | + | |
8648 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
8649 | + if (csb->csb_node->gn_nodeid < low) | |
8650 | + low = csb->csb_node->gn_nodeid; | |
8651 | + } | |
8652 | + | |
8653 | + rcom_log_clear(ls); | |
8654 | + ls->ls_low_nodeid = low; | |
8655 | + ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1; | |
8656 | + set_bit(LSFL_NODES_VALID, &ls->ls_flags); | |
8657 | + *neg_out = neg; | |
8658 | + | |
8659 | + error = nodes_reconfig_wait(ls); | |
8660 | + | |
8661 | + log_all(ls, "total nodes %d", ls->ls_num_nodes); | |
8662 | + | |
8663 | + return error; | |
8664 | +} | |
8665 | + | |
8666 | +int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr) | |
8667 | +{ | |
8668 | + gd_csb_t *csb; | |
8669 | + int i, error; | |
8670 | + uint32_t low = (uint32_t) (-1); | |
8671 | + | |
8672 | + log_all(ls, "add nodes"); | |
8673 | + | |
8674 | + for (i = 0; i < gr->gr_node_count; i++) { | |
8675 | + error = init_new_csb(gr->gr_nodeids[i], &csb); | |
8676 | + if (error) | |
8677 | + goto fail; | |
8678 | + | |
8679 | + add_ordered_node(ls, csb); | |
8680 | + ls->ls_num_nodes++; | |
8681 | + | |
8682 | + if (csb->csb_node->gn_nodeid < low) | |
8683 | + low = csb->csb_node->gn_nodeid; | |
8684 | + } | |
8685 | + | |
8686 | + ls->ls_low_nodeid = low; | |
8687 | + ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1; | |
8688 | + set_bit(LSFL_NODES_VALID, &ls->ls_flags); | |
8689 | + | |
8690 | + error = nodes_reconfig_wait(ls); | |
8691 | + | |
8692 | + log_all(ls, "total nodes %d", ls->ls_num_nodes); | |
8693 | + | |
8694 | + return error; | |
8695 | + | |
8696 | + fail: | |
8697 | + while (!list_empty(&ls->ls_nodes)) { | |
8698 | + csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list); | |
8699 | + list_del(&csb->csb_list); | |
8700 | + release_csb(csb); | |
8701 | + } | |
8702 | + ls->ls_num_nodes = 0; | |
8703 | + | |
8704 | + return error; | |
8705 | +} | |
8706 | + | |
8707 | +int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid) | |
8708 | +{ | |
8709 | + gd_csb_t *csb; | |
8710 | + | |
8711 | + list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) { | |
8712 | + if (csb->csb_node->gn_nodeid == nodeid) | |
8713 | + return TRUE; | |
8714 | + } | |
8715 | + return FALSE; | |
8716 | +} | |
8717 | diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h | |
8718 | --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 8719 | +++ linux-patched/cluster/dlm/nodes.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 8720 | @@ -0,0 +1,25 @@ |
8721 | +/****************************************************************************** | |
8722 | +******************************************************************************* | |
8723 | +** | |
8724 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
8725 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
8726 | +** | |
8727 | +** This copyrighted material is made available to anyone wishing to use, | |
8728 | +** modify, copy, or redistribute it subject to the terms and conditions | |
8729 | +** of the GNU General Public License v.2. | |
8730 | +** | |
8731 | +******************************************************************************* | |
8732 | +******************************************************************************/ | |
8733 | + | |
8734 | +#ifndef __NODES_DOT_H__ | |
8735 | +#define __NODES_DOT_H__ | |
8736 | + | |
8737 | +void dlm_nodes_init(void); | |
8738 | +int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb); | |
8739 | +void release_csb(gd_csb_t * csb); | |
8740 | +uint32_t our_nodeid(void); | |
8741 | +int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg); | |
8742 | +int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr); | |
8743 | +int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid); | |
8744 | + | |
8745 | +#endif /* __NODES_DOT_H__ */ | |
8746 | diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c | |
8747 | --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 8748 | +++ linux-patched/cluster/dlm/proc.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 8749 | @@ -0,0 +1,469 @@ |
8750 | +/****************************************************************************** | |
8751 | +******************************************************************************* | |
8752 | +** | |
8753 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
8754 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
8755 | +** | |
8756 | +** This copyrighted material is made available to anyone wishing to use, | |
8757 | +** modify, copy, or redistribute it subject to the terms and conditions | |
8758 | +** of the GNU General Public License v.2. | |
8759 | +** | |
8760 | +******************************************************************************* | |
8761 | +******************************************************************************/ | |
8762 | + | |
8763 | +#include <linux/init.h> | |
8764 | +#include <linux/proc_fs.h> | |
8765 | +#include <linux/ctype.h> | |
8766 | +#include <linux/seq_file.h> | |
8767 | +#include <linux/module.h> | |
8768 | + | |
8769 | +#include "dlm_internal.h" | |
8770 | +#include "lockspace.h" | |
8771 | + | |
8772 | +#if defined(DLM_DEBUG) | |
8773 | +#define DLM_DEBUG_SIZE (1024) | |
8774 | +#define MAX_DEBUG_MSG_LEN (64) | |
8775 | +#else | |
8776 | +#define DLM_DEBUG_SIZE (0) | |
8777 | +#define MAX_DEBUG_MSG_LEN (0) | |
8778 | +#endif | |
8779 | + | |
8780 | +static char * debug_buf; | |
8781 | +static unsigned int debug_size; | |
8782 | +static unsigned int debug_point; | |
8783 | +static int debug_wrap; | |
8784 | +static spinlock_t debug_lock; | |
8785 | +static struct proc_dir_entry * debug_proc_entry = NULL; | |
8786 | +static struct proc_dir_entry * rcom_proc_entry = NULL; | |
8787 | +static char proc_ls_name[255] = ""; | |
8788 | + | |
8789 | +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS | |
8790 | +static struct proc_dir_entry * locks_proc_entry = NULL; | |
8791 | +static struct seq_operations locks_info_op; | |
8792 | + | |
8793 | + | |
8794 | +static int locks_open(struct inode *inode, struct file *file) | |
8795 | +{ | |
8796 | + return seq_open(file, &locks_info_op); | |
8797 | +} | |
8798 | + | |
8799 | +/* Write simply sets the lockspace to use */ | |
8800 | +static ssize_t locks_write(struct file *file, const char *buf, | |
8801 | + size_t count, loff_t * ppos) | |
8802 | +{ | |
8803 | + if (count < sizeof(proc_ls_name)) { | |
8804 | + copy_from_user(proc_ls_name, buf, count); | |
8805 | + proc_ls_name[count] = '\0'; | |
8806 | + | |
8807 | + /* Remove any trailing LF so that lazy users | |
8808 | + can just echo "lsname" > /proc/cluster/dlm_locks */ | |
8809 | + if (proc_ls_name[count - 1] == '\n') | |
8810 | + proc_ls_name[count - 1] = '\0'; | |
8811 | + | |
8812 | + return count; | |
8813 | + } | |
8814 | + return 0; | |
8815 | +} | |
8816 | + | |
8817 | +static struct file_operations locks_fops = { | |
8818 | + open:locks_open, | |
8819 | + write:locks_write, | |
8820 | + read:seq_read, | |
8821 | + llseek:seq_lseek, | |
8822 | + release:seq_release, | |
8823 | +}; | |
8824 | + | |
8825 | +struct ls_dumpinfo { | |
8826 | + int entry; | |
8827 | + struct list_head *next; | |
8828 | + gd_ls_t *ls; | |
8829 | + gd_res_t *rsb; | |
8830 | +}; | |
8831 | + | |
8832 | +static int print_resource(gd_res_t * res, struct seq_file *s); | |
8833 | + | |
8834 | +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di) | |
8835 | +{ | |
8836 | + read_lock(&di->ls->ls_reshash_lock); | |
8837 | + if (!di->next) { | |
8838 | + /* Find the next non-empty hash bucket */ | |
8839 | + while (list_empty(&di->ls->ls_reshashtbl[di->entry]) && | |
8840 | + di->entry < di->ls->ls_hashsize) { | |
8841 | + di->entry++; | |
8842 | + } | |
8843 | + if (di->entry >= di->ls->ls_hashsize) { | |
8844 | + read_unlock(&di->ls->ls_reshash_lock); | |
8845 | + return NULL; /* End of hash list */ | |
8846 | + } | |
8847 | + | |
8848 | + di->next = di->ls->ls_reshashtbl[di->entry].next; | |
8849 | + } else { /* Find the next entry in the list */ | |
8850 | + | |
8851 | + di->next = di->next->next; | |
8852 | + if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) { | |
8853 | + /* End of list - move to next bucket */ | |
8854 | + di->next = NULL; | |
8855 | + di->entry++; | |
8856 | + read_unlock(&di->ls->ls_reshash_lock); | |
8857 | + | |
8858 | + return next_rsb(di); /* do the top half of this conditional */ | |
8859 | + } | |
8860 | + } | |
8861 | + di->rsb = list_entry(di->next, gd_res_t, res_hashchain); | |
8862 | + read_unlock(&di->ls->ls_reshash_lock); | |
8863 | + | |
8864 | + return di; | |
8865 | +} | |
8866 | + | |
8867 | +static void *s_start(struct seq_file *m, loff_t * pos) | |
8868 | +{ | |
8869 | + struct ls_dumpinfo *di; | |
8870 | + gd_ls_t *ls; | |
8871 | + int i; | |
8872 | + | |
8873 | + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name)); | |
8874 | + if (!ls) | |
8875 | + return NULL; | |
8876 | + | |
8877 | + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL); | |
8878 | + if (!di) | |
8879 | + return NULL; | |
8880 | + | |
8881 | + if (*pos == 0) | |
8882 | + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name); | |
8883 | + | |
8884 | + di->entry = 0; | |
8885 | + di->next = NULL; | |
8886 | + di->ls = ls; | |
8887 | + | |
8888 | + for (i = 0; i < *pos; i++) | |
8889 | + if (next_rsb(di) == NULL) | |
8890 | + return NULL; | |
8891 | + | |
8892 | + return next_rsb(di); | |
8893 | +} | |
8894 | + | |
8895 | +static void *s_next(struct seq_file *m, void *p, loff_t * pos) | |
8896 | +{ | |
8897 | + struct ls_dumpinfo *di = p; | |
8898 | + | |
8899 | + *pos += 1; | |
8900 | + | |
8901 | + return next_rsb(di); | |
8902 | +} | |
8903 | + | |
8904 | +static int s_show(struct seq_file *m, void *p) | |
8905 | +{ | |
8906 | + struct ls_dumpinfo *di = p; | |
8907 | + return print_resource(di->rsb, m); | |
8908 | +} | |
8909 | + | |
8910 | +static void s_stop(struct seq_file *m, void *p) | |
8911 | +{ | |
8912 | + kfree(p); | |
8913 | +} | |
8914 | + | |
8915 | +static struct seq_operations locks_info_op = { | |
8916 | + start:s_start, | |
8917 | + next:s_next, | |
8918 | + stop:s_stop, | |
8919 | + show:s_show | |
8920 | +}; | |
8921 | + | |
8922 | +static char *print_lockmode(int mode) | |
8923 | +{ | |
8924 | + switch (mode) { | |
8925 | + case DLM_LOCK_IV: | |
8926 | + return "--"; | |
8927 | + case DLM_LOCK_NL: | |
8928 | + return "NL"; | |
8929 | + case DLM_LOCK_CR: | |
8930 | + return "CR"; | |
8931 | + case DLM_LOCK_CW: | |
8932 | + return "CW"; | |
8933 | + case DLM_LOCK_PR: | |
8934 | + return "PR"; | |
8935 | + case DLM_LOCK_PW: | |
8936 | + return "PW"; | |
8937 | + case DLM_LOCK_EX: | |
8938 | + return "EX"; | |
8939 | + default: | |
8940 | + return "??"; | |
8941 | + } | |
8942 | +} | |
8943 | + | |
8944 | +static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res) | |
8945 | +{ | |
8946 | + | |
8947 | + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); | |
8948 | + | |
8949 | + if (lkb->lkb_status == GDLM_LKSTS_CONVERT | |
8950 | + || lkb->lkb_status == GDLM_LKSTS_WAITING) | |
8951 | + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); | |
8952 | + | |
8953 | + if (lkb->lkb_range) { | |
8954 | + /* This warns on Alpha. Tough. Only I see it */ | |
8955 | + if (lkb->lkb_status == GDLM_LKSTS_CONVERT | |
8956 | + || lkb->lkb_status == GDLM_LKSTS_GRANTED) | |
8957 | + seq_printf(s, " %" PRIx64 "-%" PRIx64, | |
8958 | + lkb->lkb_range[GR_RANGE_START], | |
8959 | + lkb->lkb_range[GR_RANGE_END]); | |
8960 | + if (lkb->lkb_status == GDLM_LKSTS_CONVERT | |
8961 | + || lkb->lkb_status == GDLM_LKSTS_WAITING) | |
8962 | + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")", | |
8963 | + lkb->lkb_range[RQ_RANGE_START], | |
8964 | + lkb->lkb_range[RQ_RANGE_END]); | |
8965 | + } | |
8966 | + | |
8967 | + if (lkb->lkb_nodeid) { | |
8968 | + if (lkb->lkb_nodeid != res->res_nodeid) | |
8969 | + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, | |
8970 | + lkb->lkb_remid); | |
8971 | + else | |
8972 | + seq_printf(s, " Master: %08x", lkb->lkb_remid); | |
8973 | + } | |
8974 | + | |
8975 | + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) | |
8976 | + seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state); | |
8977 | + | |
8978 | + seq_printf(s, "\n"); | |
8979 | +} | |
8980 | + | |
8981 | +static int print_resource(gd_res_t *res, struct seq_file *s) | |
8982 | +{ | |
8983 | + int i; | |
8984 | + struct list_head *locklist; | |
8985 | + | |
8986 | + seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res, | |
8987 | + res->res_parent, res->res_length); | |
8988 | + for (i = 0; i < res->res_length; i++) { | |
8989 | + if (isprint(res->res_name[i])) | |
8990 | + seq_printf(s, "%c", res->res_name[i]); | |
8991 | + else | |
8992 | + seq_printf(s, "%c", '.'); | |
8993 | + } | |
8994 | + if (res->res_nodeid) | |
8995 | + seq_printf(s, "\" \nLocal Copy, Master is node %d\n", | |
8996 | + res->res_nodeid); | |
8997 | + else | |
8998 | + seq_printf(s, "\" \nMaster Copy\n"); | |
8999 | + | |
9000 | + /* Print the LVB: */ | |
9001 | + if (res->res_lvbptr) { | |
9002 | + seq_printf(s, "LVB: "); | |
9003 | + for (i = 0; i < DLM_LVB_LEN; i++) { | |
9004 | + if (i == DLM_LVB_LEN / 2) | |
9005 | + seq_printf(s, "\n "); | |
9006 | + seq_printf(s, "%02x ", | |
9007 | + (unsigned char) res->res_lvbptr[i]); | |
9008 | + } | |
9009 | + seq_printf(s, "\n"); | |
9010 | + } | |
9011 | + | |
9012 | + /* Print the locks attached to this resource */ | |
9013 | + seq_printf(s, "Granted Queue\n"); | |
9014 | + list_for_each(locklist, &res->res_grantqueue) { | |
9015 | + gd_lkb_t *this_lkb = | |
9016 | + list_entry(locklist, gd_lkb_t, lkb_statequeue); | |
9017 | + print_lock(s, this_lkb, res); | |
9018 | + } | |
9019 | + | |
9020 | + seq_printf(s, "Conversion Queue\n"); | |
9021 | + list_for_each(locklist, &res->res_convertqueue) { | |
9022 | + gd_lkb_t *this_lkb = | |
9023 | + list_entry(locklist, gd_lkb_t, lkb_statequeue); | |
9024 | + print_lock(s, this_lkb, res); | |
9025 | + } | |
9026 | + | |
9027 | + seq_printf(s, "Waiting Queue\n"); | |
9028 | + list_for_each(locklist, &res->res_waitqueue) { | |
9029 | + gd_lkb_t *this_lkb = | |
9030 | + list_entry(locklist, gd_lkb_t, lkb_statequeue); | |
9031 | + print_lock(s, this_lkb, res); | |
9032 | + } | |
9033 | + return 0; | |
9034 | +} | |
9035 | +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */ | |
9036 | + | |
9037 | +void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...) | |
9038 | +{ | |
9039 | + va_list va; | |
9040 | + int i, n, size, len; | |
9041 | + char buf[MAX_DEBUG_MSG_LEN+1]; | |
9042 | + | |
9043 | + spin_lock(&debug_lock); | |
9044 | + | |
9045 | + if (!debug_buf) | |
9046 | + goto out; | |
9047 | + | |
9048 | + size = MAX_DEBUG_MSG_LEN; | |
9049 | + memset(buf, 0, size+1); | |
9050 | + | |
9051 | + n = snprintf(buf, size, "%s ", ls->ls_name); | |
9052 | + size -= n; | |
9053 | + | |
9054 | + va_start(va, fmt); | |
9055 | + vsnprintf(buf+n, size, fmt, va); | |
9056 | + va_end(va); | |
9057 | + | |
9058 | + len = strlen(buf); | |
9059 | + if (len > MAX_DEBUG_MSG_LEN-1) | |
9060 | + len = MAX_DEBUG_MSG_LEN-1; | |
9061 | + buf[len] = '\n'; | |
9062 | + buf[len+1] = '\0'; | |
9063 | + | |
9064 | + for (i = 0; i < strlen(buf); i++) { | |
9065 | + debug_buf[debug_point++] = buf[i]; | |
9066 | + | |
9067 | + if (debug_point == debug_size) { | |
9068 | + debug_point = 0; | |
9069 | + debug_wrap = 1; | |
9070 | + } | |
9071 | + } | |
9072 | + out: | |
9073 | + spin_unlock(&debug_lock); | |
9074 | +} | |
9075 | + | |
9076 | +void dlm_debug_dump(void) | |
9077 | +{ | |
9078 | + int i; | |
9079 | + | |
9080 | + spin_lock(&debug_lock); | |
9081 | + if (debug_wrap) { | |
9082 | + for (i = debug_point; i < debug_size; i++) | |
9083 | + printk("%c", debug_buf[i]); | |
9084 | + } | |
9085 | + for (i = 0; i < debug_point; i++) | |
9086 | + printk("%c", debug_buf[i]); | |
9087 | + spin_unlock(&debug_lock); | |
9088 | +} | |
9089 | + | |
9090 | +void dlm_debug_setup(int size) | |
9091 | +{ | |
9092 | + char *b = NULL; | |
9093 | + | |
9094 | + if (size > PAGE_SIZE) | |
9095 | + size = PAGE_SIZE; | |
9096 | + if (size) | |
9097 | + b = kmalloc(size, GFP_KERNEL); | |
9098 | + | |
9099 | + spin_lock(&debug_lock); | |
9100 | + if (debug_buf) | |
9101 | + kfree(debug_buf); | |
9102 | + if (!size || !b) | |
9103 | + goto out; | |
9104 | + debug_size = size; | |
9105 | + debug_point = 0; | |
9106 | + debug_wrap = 0; | |
9107 | + debug_buf = b; | |
9108 | + memset(debug_buf, 0, debug_size); | |
9109 | + out: | |
9110 | + spin_unlock(&debug_lock); | |
9111 | +} | |
9112 | + | |
9113 | +static void dlm_debug_init(void) | |
9114 | +{ | |
9115 | + debug_buf = NULL; | |
9116 | + debug_size = 0; | |
9117 | + debug_point = 0; | |
9118 | + debug_wrap = 0; | |
9119 | + spin_lock_init(&debug_lock); | |
9120 | + | |
9121 | + dlm_debug_setup(DLM_DEBUG_SIZE); | |
9122 | +} | |
9123 | + | |
9124 | +#ifdef CONFIG_PROC_FS | |
9125 | +int dlm_debug_info(char *b, char **start, off_t offset, int length) | |
9126 | +{ | |
9127 | + int i, n = 0; | |
9128 | + | |
9129 | + spin_lock(&debug_lock); | |
9130 | + | |
9131 | + if (debug_wrap) { | |
9132 | + for (i = debug_point; i < debug_size; i++) | |
9133 | + n += sprintf(b + n, "%c", debug_buf[i]); | |
9134 | + } | |
9135 | + for (i = 0; i < debug_point; i++) | |
9136 | + n += sprintf(b + n, "%c", debug_buf[i]); | |
9137 | + | |
9138 | + spin_unlock(&debug_lock); | |
9139 | + | |
9140 | + return n; | |
9141 | +} | |
9142 | + | |
9143 | +int dlm_rcom_info(char *b, char **start, off_t offset, int length) | |
9144 | +{ | |
9145 | + gd_ls_t *ls; | |
9146 | + gd_csb_t *csb; | |
9147 | + int n = 0; | |
9148 | + | |
9149 | + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name)); | |
9150 | + if (!ls) | |
9151 | + return 0; | |
9152 | + | |
9153 | + n += sprintf(b + n, "nodeid names_send_count names_send_msgid " | |
9154 | + "names_recv_count names_recv_msgid " | |
9155 | + "locks_send_count locks_send_msgid " | |
9156 | + "locks_recv_count locks_recv_msgid\n"); | |
9157 | + | |
9158 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
9159 | + n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n", | |
9160 | + csb->csb_node->gn_nodeid, | |
9161 | + csb->csb_names_send_count, | |
9162 | + csb->csb_names_send_msgid, | |
9163 | + csb->csb_names_recv_count, | |
9164 | + csb->csb_names_recv_msgid, | |
9165 | + csb->csb_locks_send_count, | |
9166 | + csb->csb_locks_send_msgid, | |
9167 | + csb->csb_locks_recv_count, | |
9168 | + csb->csb_locks_recv_msgid); | |
9169 | + } | |
9170 | + return n; | |
9171 | +} | |
9172 | +#endif | |
9173 | + | |
9174 | +void dlm_proc_init(void) | |
9175 | +{ | |
9176 | +#ifdef CONFIG_PROC_FS | |
9177 | + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO, | |
9178 | + NULL); | |
9179 | + if (!debug_proc_entry) | |
9180 | + return; | |
9181 | + | |
9182 | + debug_proc_entry->get_info = &dlm_debug_info; | |
9183 | + | |
9184 | + rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL); | |
9185 | + if (!rcom_proc_entry) | |
9186 | + return; | |
9187 | + | |
9188 | + rcom_proc_entry->get_info = &dlm_rcom_info; | |
9189 | +#endif | |
9190 | + dlm_debug_init(); | |
9191 | + | |
9192 | +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS | |
9193 | + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks", | |
9194 | + S_IFREG | 0400, | |
9195 | + NULL, NULL, NULL); | |
9196 | + if (!locks_proc_entry) | |
9197 | + return; | |
9198 | + locks_proc_entry->proc_fops = &locks_fops; | |
9199 | +#endif | |
9200 | +} | |
9201 | + | |
9202 | +void dlm_proc_exit(void) | |
9203 | +{ | |
9204 | +#ifdef CONFIG_PROC_FS | |
9205 | + if (debug_proc_entry) { | |
9206 | + remove_proc_entry("cluster/dlm_debug", NULL); | |
9207 | + dlm_debug_setup(0); | |
9208 | + } | |
9209 | + | |
9210 | + if (rcom_proc_entry) | |
9211 | + remove_proc_entry("cluster/dlm_rcom", NULL); | |
9212 | +#endif | |
9213 | + | |
9214 | +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS | |
9215 | + if (locks_proc_entry) | |
9216 | + remove_proc_entry("cluster/dlm_locks", NULL); | |
9217 | +#endif | |
9218 | +} | |
9219 | diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c | |
9220 | --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
9221 | +++ linux-patched/cluster/dlm/queries.c 2004-06-29 20:01:20.000000000 +0800 |
9222 | @@ -0,0 +1,696 @@ | |
4bf12011 | 9223 | +/****************************************************************************** |
9224 | +******************************************************************************* | |
9225 | +** | |
9226 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
9227 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
9228 | +** | |
9229 | +** This copyrighted material is made available to anyone wishing to use, | |
9230 | +** modify, copy, or redistribute it subject to the terms and conditions | |
9231 | +** of the GNU General Public License v.2. | |
9232 | +** | |
9233 | +******************************************************************************* | |
9234 | +******************************************************************************/ | |
9235 | + | |
9236 | +/* | |
9237 | + * queries.c | |
9238 | + * | |
9239 | + * This file provides the kernel query interface to the DLM. | |
9240 | + * | |
9241 | + */ | |
9242 | + | |
9243 | +#define EXPORT_SYMTAB | |
9244 | +#include <linux/module.h> | |
9245 | + | |
9246 | +#include "dlm_internal.h" | |
5cdbd17b | 9247 | +#include "lockspace.h" |
4bf12011 | 9248 | +#include "lockqueue.h" |
9249 | +#include "locking.h" | |
9250 | +#include "lkb.h" | |
9251 | +#include "nodes.h" | |
9252 | +#include "dir.h" | |
9253 | +#include "ast.h" | |
9254 | +#include "memory.h" | |
9255 | +#include "lowcomms.h" | |
9256 | +#include "midcomms.h" | |
9257 | +#include "rsb.h" | |
9258 | + | |
9259 | +static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo); | |
9260 | +static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo); | |
9261 | + | |
9262 | +/* | |
9263 | + * API entry point. | |
9264 | + */ | |
9265 | +int dlm_query(void *lockspace, | |
9266 | + struct dlm_lksb *lksb, | |
9267 | + int query, | |
9268 | + struct dlm_queryinfo *qinfo, | |
9269 | + void (ast_routine(void *)), | |
9270 | + void *astarg) | |
9271 | +{ | |
9272 | + int status = -EINVAL; | |
9273 | + gd_lkb_t *target_lkb; | |
9274 | + gd_lkb_t *query_lkb = NULL; /* Our temporary LKB */ | |
9275 | + gd_ls_t *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace); | |
9276 | + | |
9277 | + | |
9278 | + if (!qinfo) | |
9279 | + goto out; | |
9280 | + if (!ls) | |
9281 | + goto out; | |
9282 | + if (!ast_routine) | |
9283 | + goto out; | |
9284 | + if (!lksb) | |
9285 | + goto out; | |
9286 | + | |
9287 | + if (!qinfo->gqi_lockinfo) | |
9288 | + qinfo->gqi_locksize = 0; | |
9289 | + | |
9290 | + /* Find the lkid */ | |
9291 | + target_lkb = find_lock_by_id(ls, lksb->sb_lkid); | |
9292 | + if (!target_lkb) | |
9293 | + goto out; | |
9294 | + | |
9295 | + /* If the user wants a list of locks that are blocking or | |
9296 | + not blocking this lock, then it must be waiting | |
9297 | + for something | |
9298 | + */ | |
9299 | + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING || | |
9300 | + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) && | |
9301 | + target_lkb->lkb_status == GDLM_LKSTS_GRANTED) | |
9302 | + return -EINVAL; | |
9303 | + | |
9304 | + /* We now allocate an LKB for our own use (so we can hang | |
9305 | + * things like the AST routine and the lksb from it) */ | |
9306 | + lksb->sb_status = -EBUSY; | |
9307 | + query_lkb = create_lkb(ls); | |
9308 | + if (!query_lkb) { | |
9309 | + status = -ENOMEM; | |
9310 | + goto out; | |
9311 | + } | |
9312 | + query_lkb->lkb_astaddr = ast_routine; | |
9313 | + query_lkb->lkb_astparam = (long)astarg; | |
9314 | + query_lkb->lkb_resource = target_lkb->lkb_resource; | |
9315 | + query_lkb->lkb_lksb = lksb; | |
9316 | + | |
9317 | + /* Don't free the resource while we are querying it. This ref | |
9318 | + * will be dropped when the LKB is freed */ | |
9319 | + hold_rsb(query_lkb->lkb_resource); | |
9320 | + | |
9321 | + /* Fill in the stuff that's always local */ | |
9322 | + if (qinfo->gqi_resinfo) { | |
9323 | + if (target_lkb->lkb_resource->res_nodeid) | |
9324 | + qinfo->gqi_resinfo->rsi_masternode = | |
9325 | + target_lkb->lkb_resource->res_nodeid; | |
9326 | + else | |
9327 | + qinfo->gqi_resinfo->rsi_masternode = our_nodeid(); | |
9328 | + qinfo->gqi_resinfo->rsi_length = | |
9329 | + target_lkb->lkb_resource->res_length; | |
9330 | + memcpy(qinfo->gqi_resinfo->rsi_name, | |
9331 | + target_lkb->lkb_resource->res_name, | |
9332 | + qinfo->gqi_resinfo->rsi_length); | |
9333 | + } | |
9334 | + | |
9335 | + /* If the master is local (or the user doesn't want the overhead of a | |
9336 | + * remote call) - fill in the details here */ | |
9337 | + if (target_lkb->lkb_resource->res_nodeid == 0 || | |
9338 | + (query & DLM_QUERY_LOCAL)) { | |
9339 | + | |
9340 | + status = 0; | |
9341 | + /* Resource info */ | |
9342 | + if (qinfo->gqi_resinfo) { | |
9343 | + query_resource(target_lkb->lkb_resource, | |
9344 | + qinfo->gqi_resinfo); | |
9345 | + } | |
9346 | + | |
9347 | + /* Lock lists */ | |
9348 | + if (qinfo->gqi_lockinfo) { | |
9349 | + status = query_locks(query, target_lkb, qinfo); | |
9350 | + } | |
9351 | + | |
9352 | + query_lkb->lkb_retstatus = status; | |
5cdbd17b | 9353 | + queue_ast(query_lkb, AST_COMP | AST_DEL, 0); |
4bf12011 | 9354 | + wake_astd(); |
9355 | + | |
9356 | + /* An AST will be delivered so we must return success here */ | |
9357 | + status = 0; | |
9358 | + goto out; | |
9359 | + } | |
9360 | + | |
9361 | + /* Remote master */ | |
9362 | + if (target_lkb->lkb_resource->res_nodeid != 0) | |
9363 | + { | |
9364 | + struct gd_remquery *remquery; | |
9365 | + struct writequeue_entry *e; | |
9366 | + | |
9367 | + /* Clear this cos the receiving end adds to it with | |
9368 | + each incoming packet */ | |
9369 | + qinfo->gqi_lockcount = 0; | |
9370 | + | |
9371 | + /* Squirrel a pointer to the query info struct | |
9372 | + somewhere illegal */ | |
9373 | + query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo; | |
9374 | + | |
9375 | + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid, | |
9376 | + sizeof(struct gd_remquery), | |
9377 | + ls->ls_allocation, | |
9378 | + (char **) &remquery); | |
9379 | + if (!e) { | |
9380 | + status = -ENOBUFS; | |
9381 | + goto out; | |
9382 | + } | |
9383 | + | |
9384 | + /* Build remote packet */ | |
9385 | + memset(remquery, 0, sizeof(struct gd_remquery)); | |
9386 | + | |
9387 | + remquery->rq_maxlocks = qinfo->gqi_locksize; | |
9388 | + remquery->rq_query = query; | |
9389 | + remquery->rq_mstlkid = target_lkb->lkb_remid; | |
9390 | + if (qinfo->gqi_lockinfo) | |
9391 | + remquery->rq_maxlocks = qinfo->gqi_locksize; | |
9392 | + | |
9393 | + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY; | |
9394 | + remquery->rq_header.rh_flags = 0; | |
9395 | + remquery->rq_header.rh_length = sizeof(struct gd_remquery); | |
9396 | + remquery->rq_header.rh_lkid = query_lkb->lkb_id; | |
9397 | + remquery->rq_header.rh_lockspace = ls->ls_global_id; | |
9398 | + | |
9399 | + midcomms_send_buffer(&remquery->rq_header, e); | |
9400 | + status = 0; | |
9401 | + } | |
9402 | + | |
9403 | + out: | |
9404 | + | |
9405 | + return status; | |
9406 | +} | |
9407 | + | |
9408 | +static inline int valid_range(struct dlm_range *r) | |
9409 | +{ | |
9410 | + if (r->ra_start != 0ULL || | |
9411 | + r->ra_end != 0xFFFFFFFFFFFFFFFFULL) | |
9412 | + return 1; | |
9413 | + else | |
9414 | + return 0; | |
9415 | +} | |
9416 | + | |
9417 | +static void put_int(int x, char *buf, int *offp) | |
9418 | +{ | |
9419 | + x = cpu_to_le32(x); | |
9420 | + memcpy(buf + *offp, &x, sizeof(int)); | |
9421 | + *offp += sizeof(int); | |
9422 | +} | |
9423 | + | |
9424 | +static void put_int64(uint64_t x, char *buf, int *offp) | |
9425 | +{ | |
9426 | + x = cpu_to_le64(x); | |
9427 | + memcpy(buf + *offp, &x, sizeof(uint64_t)); | |
9428 | + *offp += sizeof(uint64_t); | |
9429 | +} | |
9430 | + | |
9431 | +static int get_int(char *buf, int *offp) | |
9432 | +{ | |
9433 | + int value; | |
9434 | + memcpy(&value, buf + *offp, sizeof(int)); | |
9435 | + *offp += sizeof(int); | |
9436 | + return le32_to_cpu(value); | |
9437 | +} | |
9438 | + | |
9439 | +static uint64_t get_int64(char *buf, int *offp) | |
9440 | +{ | |
9441 | + uint64_t value; | |
9442 | + | |
9443 | + memcpy(&value, buf + *offp, sizeof(uint64_t)); | |
9444 | + *offp += sizeof(uint64_t); | |
9445 | + return le64_to_cpu(value); | |
9446 | +} | |
9447 | + | |
9448 | +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4) | |
9449 | + | |
9450 | +/* Called from recvd to get lock info for a remote node */ | |
9451 | +int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg) | |
9452 | +{ | |
9453 | + struct gd_remquery *query = (struct gd_remquery *) msg; | |
9454 | + struct gd_remqueryreply *reply; | |
9455 | + struct dlm_resinfo resinfo; | |
9456 | + struct dlm_queryinfo qinfo; | |
9457 | + struct writequeue_entry *e; | |
9458 | + char *buf; | |
9459 | + gd_lkb_t *lkb; | |
9460 | + int status = 0; | |
9461 | + int bufidx; | |
9462 | + int finished = 0; | |
9463 | + int cur_lock = 0; | |
9464 | + int start_lock = 0; | |
9465 | + | |
9466 | + lkb = find_lock_by_id(ls, query->rq_mstlkid); | |
9467 | + if (!lkb) { | |
9468 | + status = -EINVAL; | |
9469 | + goto send_error; | |
9470 | + } | |
9471 | + | |
9472 | + qinfo.gqi_resinfo = &resinfo; | |
9473 | + qinfo.gqi_locksize = query->rq_maxlocks; | |
9474 | + | |
9475 | + /* Get the resource bits */ | |
9476 | + query_resource(lkb->lkb_resource, &resinfo); | |
9477 | + | |
9478 | + /* Now get the locks if wanted */ | |
9479 | + if (query->rq_maxlocks) { | |
9480 | + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks, | |
9481 | + GFP_KERNEL); | |
9482 | + if (!qinfo.gqi_lockinfo) { | |
9483 | + status = -ENOMEM; | |
9484 | + goto send_error; | |
9485 | + } | |
9486 | + | |
9487 | + status = query_locks(query->rq_query, lkb, &qinfo); | |
9488 | + if (status && status != -E2BIG) { | |
9489 | + kfree(qinfo.gqi_lockinfo); | |
9490 | + goto send_error; | |
9491 | + } | |
9492 | + } | |
9493 | + else { | |
9494 | + qinfo.gqi_lockinfo = NULL; | |
9495 | + qinfo.gqi_lockcount = 0; | |
9496 | + } | |
9497 | + | |
9498 | + /* Send as many blocks as needed for all the locks */ | |
9499 | + do { | |
9500 | + int i; | |
9501 | + int msg_len = sizeof(struct gd_remqueryreply); | |
9502 | + int last_msg_len = msg_len; /* keeps compiler quiet */ | |
9503 | + int last_lock; | |
9504 | + | |
9505 | + /* First work out how many locks we can fit into a block */ | |
9506 | + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) { | |
9507 | + | |
9508 | + last_msg_len = msg_len; | |
9509 | + | |
9510 | + msg_len += LOCK_LEN; | |
9511 | + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) || | |
9512 | + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) { | |
9513 | + | |
9514 | + msg_len += sizeof(uint64_t) * 4; | |
9515 | + } | |
9516 | + } | |
9517 | + | |
9518 | + /* There must be a neater way of doing this... */ | |
9519 | + if (msg_len > PAGE_SIZE) { | |
9520 | + last_lock = i-1; | |
9521 | + msg_len = last_msg_len; | |
9522 | + } | |
9523 | + else { | |
9524 | + last_lock = i; | |
9525 | + } | |
9526 | + | |
9527 | + e = lowcomms_get_buffer(nodeid, | |
9528 | + msg_len, | |
9529 | + ls->ls_allocation, | |
9530 | + (char **) &reply); | |
9531 | + if (!e) { | |
9532 | + kfree(qinfo.gqi_lockinfo); | |
9533 | + status = -ENOBUFS; | |
9534 | + goto out; | |
9535 | + } | |
9536 | + | |
9537 | + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY; | |
9538 | + reply->rq_header.rh_length = msg_len; | |
9539 | + reply->rq_header.rh_lkid = msg->rh_lkid; | |
9540 | + reply->rq_header.rh_lockspace = msg->rh_lockspace; | |
9541 | + | |
9542 | + reply->rq_status = status; | |
9543 | + reply->rq_startlock = cur_lock; | |
9544 | + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount; | |
9545 | + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount; | |
9546 | + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount; | |
9547 | + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN); | |
9548 | + | |
9549 | + buf = (char *)reply; | |
9550 | + bufidx = sizeof(struct gd_remqueryreply); | |
9551 | + | |
9552 | + for (; cur_lock < last_lock; cur_lock++) { | |
9553 | + | |
9554 | + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state; | |
9555 | + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode; | |
9556 | + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode; | |
9557 | + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx); | |
9558 | + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx); | |
9559 | + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx); | |
9560 | + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx); | |
9561 | + | |
9562 | + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) || | |
9563 | + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) { | |
9564 | + | |
9565 | + buf[bufidx++] = 1; | |
9566 | + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx); | |
9567 | + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx); | |
9568 | + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx); | |
9569 | + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx); | |
9570 | + } | |
9571 | + else { | |
9572 | + buf[bufidx++] = 0; | |
9573 | + } | |
9574 | + } | |
9575 | + | |
9576 | + if (cur_lock == qinfo.gqi_lockcount) { | |
9577 | + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; | |
9578 | + finished = 1; | |
9579 | + } | |
9580 | + else { | |
9581 | + reply->rq_header.rh_flags = 0; | |
9582 | + } | |
9583 | + | |
9584 | + reply->rq_numlocks = cur_lock - start_lock; | |
9585 | + start_lock = cur_lock; | |
9586 | + | |
9587 | + midcomms_send_buffer(&reply->rq_header, e); | |
9588 | + } while (!finished); | |
9589 | + | |
9590 | + kfree(qinfo.gqi_lockinfo); | |
9591 | + out: | |
9592 | + return status; | |
9593 | + | |
9594 | + send_error: | |
9595 | + e = lowcomms_get_buffer(nodeid, | |
9596 | + sizeof(struct gd_remqueryreply), | |
9597 | + ls->ls_allocation, | |
9598 | + (char **) &reply); | |
9599 | + if (!e) { | |
9600 | + status = -ENOBUFS; | |
9601 | + goto out; | |
9602 | + } | |
9603 | + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY; | |
9604 | + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */ | |
9605 | + reply->rq_header.rh_length = sizeof(struct gd_remqueryreply); | |
9606 | + reply->rq_header.rh_lkid = msg->rh_lkid; | |
9607 | + reply->rq_header.rh_lockspace = msg->rh_lockspace; | |
9608 | + reply->rq_status = status; | |
9609 | + reply->rq_numlocks = 0; | |
9610 | + reply->rq_startlock = 0; | |
9611 | + reply->rq_grantcount = 0; | |
9612 | + reply->rq_convcount = 0; | |
9613 | + reply->rq_waitcount = 0; | |
9614 | + | |
9615 | + midcomms_send_buffer(&reply->rq_header, e); | |
9616 | + | |
9617 | + return status; | |
9618 | +} | |
9619 | + | |
9620 | +/* Reply to a remote query */ | |
9621 | +int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg) | |
9622 | +{ | |
9623 | + gd_lkb_t *query_lkb; | |
9624 | + struct dlm_queryinfo *qinfo; | |
9625 | + struct gd_remqueryreply *reply; | |
9626 | + char *buf; | |
9627 | + int i; | |
9628 | + int bufidx; | |
9629 | + | |
9630 | + query_lkb = find_lock_by_id(ls, msg->rh_lkid); | |
9631 | + if (!query_lkb) | |
9632 | + return -EINVAL; | |
9633 | + | |
9634 | + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request; | |
9635 | + reply = (struct gd_remqueryreply *) msg; | |
9636 | + | |
9637 | + /* Copy the easy bits first */ | |
9638 | + qinfo->gqi_lockcount += reply->rq_numlocks; | |
9639 | + if (qinfo->gqi_resinfo) { | |
9640 | + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount; | |
9641 | + qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount; | |
9642 | + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount; | |
9643 | + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk, | |
9644 | + DLM_LVB_LEN); | |
9645 | + } | |
9646 | + | |
9647 | + /* Now unpack the locks */ | |
9648 | + bufidx = sizeof(struct gd_remqueryreply); | |
9649 | + buf = (char *) msg; | |
9650 | + | |
9651 | + GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize, | |
9652 | + printk("start = %d, num + %d. Max= %d\n", | |
9653 | + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize);); | |
9654 | + | |
9655 | + for (i = reply->rq_startlock; | |
9656 | + i < reply->rq_startlock + reply->rq_numlocks; i++) { | |
9657 | + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++]; | |
9658 | + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++]; | |
9659 | + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++]; | |
9660 | + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx); | |
9661 | + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx); | |
9662 | + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx); | |
9663 | + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx); | |
9664 | + if (buf[bufidx++]) { | |
9665 | + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx); | |
9666 | + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx); | |
9667 | + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx); | |
9668 | + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx); | |
9669 | + } | |
9670 | + else { | |
9671 | + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL; | |
9672 | + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL; | |
9673 | + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL; | |
9674 | + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL; | |
9675 | + } | |
9676 | + } | |
9677 | + | |
9678 | + /* If this was the last block then now tell the user */ | |
9679 | + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) { | |
9680 | + query_lkb->lkb_retstatus = reply->rq_status; | |
5cdbd17b | 9681 | + queue_ast(query_lkb, AST_COMP | AST_DEL, 0); |
4bf12011 | 9682 | + wake_astd(); |
9683 | + } | |
9684 | + | |
9685 | + return 0; | |
9686 | +} | |
9687 | + | |
9688 | +/* Aggregate resource information */ | |
9689 | +static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo) | |
9690 | +{ | |
9691 | + struct list_head *tmp; | |
9692 | + | |
9693 | + | |
9694 | + if (rsb->res_lvbptr) | |
9695 | + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN); | |
9696 | + | |
9697 | + resinfo->rsi_grantcount = 0; | |
9698 | + list_for_each(tmp, &rsb->res_grantqueue) { | |
9699 | + resinfo->rsi_grantcount++; | |
9700 | + } | |
9701 | + | |
9702 | + resinfo->rsi_waitcount = 0; | |
9703 | + list_for_each(tmp, &rsb->res_waitqueue) { | |
9704 | + resinfo->rsi_waitcount++; | |
9705 | + } | |
9706 | + | |
9707 | + resinfo->rsi_convcount = 0; | |
9708 | + list_for_each(tmp, &rsb->res_convertqueue) { | |
9709 | + resinfo->rsi_convcount++; | |
9710 | + } | |
9711 | + | |
9712 | + return 0; | |
9713 | +} | |
9714 | + | |
9715 | +static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo) | |
9716 | +{ | |
9717 | + int entry; | |
9718 | + | |
9719 | + /* Don't fill it in if the buffer is full */ | |
9720 | + if (qinfo->gqi_lockcount == qinfo->gqi_locksize) | |
9721 | + return -E2BIG; | |
9722 | + | |
9723 | + /* gqi_lockcount contains the number of locks we have returned */ | |
9724 | + entry = qinfo->gqi_lockcount++; | |
9725 | + | |
9726 | + /* Fun with master copies */ | |
9727 | + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { | |
9728 | + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid; | |
9729 | + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id; | |
9730 | + } | |
9731 | + else { | |
9732 | + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id; | |
9733 | + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid; | |
9734 | + } | |
9735 | + | |
9736 | + /* Also make sure we always have a valid nodeid in there, the | |
9737 | + calling end may not know which node "0" is */ | |
9738 | + if (lkb->lkb_nodeid) | |
9739 | + qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid; | |
9740 | + else | |
9741 | + qinfo->gqi_lockinfo[entry].lki_node = our_nodeid(); | |
9742 | + | |
9743 | + if (lkb->lkb_parent) | |
9744 | + qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id; | |
9745 | + else | |
9746 | + qinfo->gqi_lockinfo[entry].lki_parent = 0; | |
9747 | + | |
9748 | + qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status; | |
9749 | + qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode; | |
9750 | + qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode; | |
9751 | + | |
9752 | + if (lkb->lkb_range) { | |
9753 | + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = | |
9754 | + lkb->lkb_range[GR_RANGE_START]; | |
9755 | + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = | |
9756 | + lkb->lkb_range[GR_RANGE_END]; | |
9757 | + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = | |
9758 | + lkb->lkb_range[RQ_RANGE_START]; | |
9759 | + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = | |
9760 | + lkb->lkb_range[RQ_RANGE_END]; | |
9761 | + } else { | |
9762 | + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL; | |
9763 | + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0xffffffffffffffffULL; | |
9764 | + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL; | |
9765 | + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0xffffffffffffffffULL; | |
9766 | + } | |
9767 | + return 0; | |
9768 | +} | |
9769 | + | |
9770 | +static int query_lkb_queue(struct list_head *queue, int query, | |
9771 | + struct dlm_queryinfo *qinfo) | |
9772 | +{ | |
9773 | + struct list_head *tmp; | |
9774 | + int status = 0; | |
9775 | + int mode = query & DLM_QUERY_MODE_MASK; | |
9776 | + | |
9777 | + list_for_each(tmp, queue) { | |
9778 | + gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue); | |
9779 | + int lkmode; | |
9780 | + | |
9781 | + if (query & DLM_QUERY_RQMODE) | |
9782 | + lkmode = lkb->lkb_rqmode; | |
9783 | + else | |
9784 | + lkmode = lkb->lkb_grmode; | |
9785 | + | |
9786 | + /* Add the LKB info to the list if it matches the criteria in | |
9787 | + * the query bitmap */ | |
9788 | + switch (query & DLM_QUERY_MASK) { | |
9789 | + case DLM_QUERY_LOCKS_ALL: | |
9790 | + status = add_lock(lkb, qinfo); | |
9791 | + break; | |
9792 | + | |
9793 | + case DLM_QUERY_LOCKS_HIGHER: | |
9794 | + if (lkmode > mode) | |
9795 | + status = add_lock(lkb, qinfo); | |
9796 | + break; | |
9797 | + | |
9798 | + case DLM_QUERY_LOCKS_EQUAL: | |
9799 | + if (lkmode == mode) | |
9800 | + status = add_lock(lkb, qinfo); | |
9801 | + break; | |
9802 | + | |
9803 | + case DLM_QUERY_LOCKS_LOWER: | |
9804 | + if (lkmode < mode) | |
9805 | + status = add_lock(lkb, qinfo); | |
9806 | + break; | |
9807 | + } | |
9808 | + } | |
9809 | + return status; | |
9810 | +} | |
9811 | + | |
9812 | +/* | |
9813 | + * Return 1 if the locks' ranges overlap | |
9814 | + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff | |
9815 | + */ | |
9816 | +static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2) | |
9817 | +{ | |
9818 | + if (!lkb1->lkb_range || !lkb2->lkb_range) | |
9819 | + return 1; | |
9820 | + | |
9821 | + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] || | |
9822 | + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END]) | |
9823 | + return 0; | |
9824 | + | |
9825 | + return 1; | |
9826 | +} | |
9827 | +extern const int __dlm_compat_matrix[8][8]; | |
9828 | + | |
9829 | + | |
9830 | +static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo) | |
9831 | +{ | |
9832 | + struct list_head *tmp; | |
9833 | + int status = 0; | |
9834 | + | |
9835 | + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) { | |
9836 | + gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue); | |
9837 | + | |
9838 | + if (ranges_overlap(lkb, qlkb) && | |
9839 | + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]) | |
9840 | + status = add_lock(lkb, qinfo); | |
9841 | + } | |
9842 | + | |
9843 | + return status; | |
9844 | +} | |
9845 | + | |
9846 | +static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo) | |
9847 | +{ | |
9848 | + struct list_head *tmp; | |
9849 | + int status = 0; | |
9850 | + | |
9851 | + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) { | |
9852 | + gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue); | |
9853 | + | |
9854 | + if (!(ranges_overlap(lkb, qlkb) && | |
9855 | + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])) | |
9856 | + status = add_lock(lkb, qinfo); | |
9857 | + } | |
9858 | + | |
9859 | + return status; | |
9860 | +} | |
9861 | + | |
9862 | +/* Gather a list of appropriate locks */ | |
9863 | +static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo) | |
9864 | +{ | |
9865 | + int status = 0; | |
9866 | + | |
9867 | + | |
9868 | + /* Mask in the actual granted/requsted mode of the lock if LOCK_THIS | |
9869 | + * was requested as the mode | |
9870 | + */ | |
9871 | + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) { | |
9872 | + query &= ~DLM_QUERY_MODE_MASK; | |
9873 | + if (query & DLM_QUERY_RQMODE) | |
9874 | + query |= lkb->lkb_rqmode; | |
9875 | + else | |
9876 | + query |= lkb->lkb_grmode; | |
9877 | + } | |
9878 | + | |
9879 | + qinfo->gqi_lockcount = 0; | |
9880 | + | |
9881 | + /* BLOCKING/NOTBLOCK only look at the granted queue */ | |
9882 | + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING) | |
9883 | + return get_blocking_locks(lkb, qinfo); | |
9884 | + | |
9885 | + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) | |
9886 | + return get_nonblocking_locks(lkb, qinfo); | |
9887 | + | |
9888 | + /* Do the lock queues that were requested */ | |
9889 | + if (query & DLM_QUERY_QUEUE_GRANT) { | |
9890 | + status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue, | |
9891 | + query, qinfo); | |
9892 | + } | |
9893 | + | |
9894 | + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) { | |
9895 | + status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue, | |
9896 | + query, qinfo); | |
9897 | + } | |
9898 | + | |
9899 | + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) { | |
9900 | + status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue, | |
9901 | + query, qinfo); | |
9902 | + } | |
9903 | + | |
9904 | + | |
9905 | + return status; | |
9906 | +} | |
9907 | + | |
9908 | +EXPORT_SYMBOL(dlm_query); | |
9909 | +/* | |
9910 | + * Overrides for Emacs so that we follow Linus's tabbing style. | |
9911 | + * Emacs will notice this stuff at the end of the file and automatically | |
9912 | + * adjust the settings for this buffer only. This must remain at the end | |
9913 | + * of the file. | |
9914 | + * --------------------------------------------------------------------------- | |
9915 | + * Local variables: | |
9916 | + * c-file-style: "linux" | |
9917 | + * End: | |
9918 | + */ | |
9919 | diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h | |
9920 | --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 9921 | +++ linux-patched/cluster/dlm/queries.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 9922 | @@ -0,0 +1,20 @@ |
9923 | +/****************************************************************************** | |
9924 | +******************************************************************************* | |
9925 | +** | |
9926 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
9927 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
9928 | +** | |
9929 | +** This copyrighted material is made available to anyone wishing to use, | |
9930 | +** modify, copy, or redistribute it subject to the terms and conditions | |
9931 | +** of the GNU General Public License v.2. | |
9932 | +** | |
9933 | +******************************************************************************* | |
9934 | +******************************************************************************/ | |
9935 | + | |
9936 | +#ifndef __QUERIES_DOT_H__ | |
9937 | +#define __QUERIES_DOT_H__ | |
9938 | + | |
9939 | +extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg); | |
9940 | +extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg); | |
9941 | + | |
9942 | +#endif /* __QUERIES_DOT_H__ */ | |
9943 | diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c | |
9944 | --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b AM |
9945 | +++ linux-patched/cluster/dlm/rebuild.c 2004-06-29 20:01:20.000000000 +0800 |
9946 | @@ -0,0 +1,1245 @@ | |
4bf12011 | 9947 | +/****************************************************************************** |
9948 | +******************************************************************************* | |
9949 | +** | |
9950 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
9951 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
9952 | +** | |
9953 | +** This copyrighted material is made available to anyone wishing to use, | |
9954 | +** modify, copy, or redistribute it subject to the terms and conditions | |
9955 | +** of the GNU General Public License v.2. | |
9956 | +** | |
9957 | +******************************************************************************* | |
9958 | +******************************************************************************/ | |
9959 | + | |
9960 | +/* | |
9961 | + * Rebuild RSB's on new masters. Functions for transferring locks and | |
9962 | + * subresources to new RSB masters during recovery. | |
9963 | + */ | |
9964 | + | |
9965 | +#include "dlm_internal.h" | |
9966 | +#include "reccomms.h" | |
9967 | +#include "lkb.h" | |
9968 | +#include "rsb.h" | |
9969 | +#include "nodes.h" | |
9970 | +#include "config.h" | |
9971 | +#include "memory.h" | |
9972 | +#include "recover.h" | |
9973 | + | |
9974 | + | |
9975 | +/* Types of entity serialised in remastering messages */ | |
9976 | +#define REMASTER_ROOTRSB 1 | |
9977 | +#define REMASTER_RSB 2 | |
9978 | +#define REMASTER_LKB 3 | |
9979 | + | |
9980 | +struct rcom_fill { | |
9981 | + char * outbuf; /* Beginning of data */ | |
9982 | + int offset; /* Current offset into outbuf */ | |
9983 | + int maxlen; /* Max value of offset */ | |
9984 | + int remasterid; | |
9985 | + int count; | |
9986 | + gd_res_t * rsb; | |
9987 | + gd_res_t * subrsb; | |
9988 | + gd_lkb_t * lkb; | |
9989 | + struct list_head * lkbqueue; | |
9990 | + char more; | |
9991 | +}; | |
9992 | +typedef struct rcom_fill rcom_fill_t; | |
9993 | + | |
9994 | + | |
9995 | +struct rebuild_node { | |
9996 | + struct list_head list; | |
9997 | + int nodeid; | |
9998 | + gd_res_t * rootrsb; | |
9999 | +}; | |
10000 | +typedef struct rebuild_node rebuild_node_t; | |
10001 | + | |
10002 | + | |
10003 | +/* | |
10004 | + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new | |
10005 | + * master. The rsb will be "done" with recovery when the new master has | |
10006 | + * replied with all the new remote lockid's for this rsb's lkb's. | |
10007 | + */ | |
10008 | + | |
10009 | +void expect_new_lkids(gd_res_t *rsb) | |
10010 | +{ | |
10011 | + rsb->res_newlkid_expect = 0; | |
10012 | + recover_list_add(rsb); | |
10013 | +} | |
10014 | + | |
10015 | +/* | |
10016 | + * This function is called on root rsb or subrsb when another lkb is being sent | |
10017 | + * to the new master for which we expect to receive a corresponding remote lkid | |
10018 | + */ | |
10019 | + | |
10020 | +void need_new_lkid(gd_res_t *rsb) | |
10021 | +{ | |
10022 | + gd_res_t *root = rsb; | |
10023 | + | |
10024 | + if (rsb->res_parent) | |
10025 | + root = rsb->res_root; | |
10026 | + | |
10027 | + if (!root->res_newlkid_expect) | |
10028 | + recover_list_add(root); | |
10029 | + else | |
10030 | + GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),); | |
10031 | + | |
10032 | + root->res_newlkid_expect++; | |
10033 | +} | |
10034 | + | |
10035 | +/* | |
10036 | + * This function is called for each lkb for which a new remote lkid is | |
10037 | + * received. Decrement the expected number of remote lkids expected for the | |
10038 | + * root rsb. | |
10039 | + */ | |
10040 | + | |
10041 | +void have_new_lkid(gd_lkb_t *lkb) | |
10042 | +{ | |
10043 | + gd_res_t *root = lkb->lkb_resource; | |
10044 | + | |
10045 | + if (root->res_parent) | |
10046 | + root = root->res_root; | |
10047 | + | |
10048 | + down_write(&root->res_lock); | |
10049 | + | |
10050 | + GDLM_ASSERT(root->res_newlkid_expect, | |
10051 | + printk("newlkid_expect=%d\n", root->res_newlkid_expect);); | |
10052 | + | |
10053 | + root->res_newlkid_expect--; | |
10054 | + | |
10055 | + if (!root->res_newlkid_expect) { | |
10056 | + clear_bit(RESFL_NEW_MASTER, &root->res_flags); | |
10057 | + recover_list_del(root); | |
10058 | + } | |
10059 | + up_write(&root->res_lock); | |
10060 | +} | |
10061 | + | |
10062 | +/* | |
10063 | + * Return the rebuild struct for a node - will create an entry on the rootrsb | |
10064 | + * list if necessary. | |
10065 | + * | |
10066 | + * Currently no locking is needed here as it all happens in the gdlm_recvd | |
10067 | + * thread | |
10068 | + */ | |
10069 | + | |
10070 | +static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid) | |
10071 | +{ | |
10072 | + rebuild_node_t *node = NULL; | |
10073 | + | |
10074 | + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) { | |
10075 | + if (node->nodeid == nodeid) | |
10076 | + return node; | |
10077 | + } | |
10078 | + | |
10079 | + /* Not found, add one */ | |
10080 | + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL); | |
10081 | + if (!node) | |
10082 | + return NULL; | |
10083 | + | |
10084 | + node->nodeid = nodeid; | |
10085 | + node->rootrsb = NULL; | |
10086 | + list_add(&node->list, &ls->ls_rebuild_rootrsb_list); | |
10087 | + | |
10088 | + return node; | |
10089 | +} | |
10090 | + | |
10091 | +/* | |
10092 | + * Tidy up after a rebuild run. Called when all recovery has finished | |
10093 | + */ | |
10094 | + | |
10095 | +void rebuild_freemem(gd_ls_t *ls) | |
10096 | +{ | |
10097 | + rebuild_node_t *node = NULL, *s; | |
10098 | + | |
10099 | + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) { | |
10100 | + list_del(&node->list); | |
10101 | + kfree(node); | |
10102 | + } | |
10103 | +} | |
10104 | + | |
/* Append a 32-bit int to buf at *offp in little-endian wire order. */
static void put_int(int x, char *buf, int *offp)
{
	int le = cpu_to_le32(x);

	memcpy(buf + *offp, &le, sizeof(le));
	*offp += sizeof(le);
}
10111 | + | |
/* Append a 64-bit int to buf at *offp in little-endian wire order. */
static void put_int64(uint64_t x, char *buf, int *offp)
{
	uint64_t le = cpu_to_le64(x);

	memcpy(buf + *offp, &le, sizeof(le));
	*offp += sizeof(le);
}
10118 | + | |
/* Append a length-prefixed byte string (32-bit length, then data) to
 * buf at *offp. */
static void put_bytes(char *x, int len, char *buf, int *offp)
{
	put_int(len, buf, offp);
	memcpy(buf + *offp, x, len);
	*offp += len;
}
10125 | + | |
/* Append a single byte to buf at *offp. */
static void put_char(char x, char *buf, int *offp)
{
	buf[(*offp)++] = x;
}
10131 | + | |
/* Read a little-endian 32-bit int from buf at *offp. */
static int get_int(char *buf, int *offp)
{
	int le;

	memcpy(&le, buf + *offp, sizeof(le));
	*offp += sizeof(le);
	return le32_to_cpu(le);
}
10139 | + | |
/* Read a little-endian 64-bit int from buf at *offp. */
static uint64_t get_int64(char *buf, int *offp)
{
	uint64_t le;

	memcpy(&le, buf + *offp, sizeof(le));
	*offp += sizeof(le);
	return le64_to_cpu(le);
}
10148 | + | |
/* Read a single byte from buf at *offp. */
static char get_char(char *buf, int *offp)
{
	return buf[(*offp)++];
}
10156 | + | |
/* Read a length-prefixed byte string from buf at *offp; the length is
 * stored through *len and the data copied into 'bytes'.
 * NOTE(review): there is no bound on *len here - the caller must
 * guarantee 'bytes' can hold any length the peer may send. */
static void get_bytes(char *bytes, int *len, char *buf, int *offp)
{
	*len = get_int(buf, offp);
	memcpy(bytes, buf + *offp, *len);
	*offp += *len;
}
10163 | + | |
10164 | +static int lkb_length(gd_lkb_t *lkb) | |
10165 | +{ | |
10166 | + int len = 0; | |
10167 | + | |
10168 | + len += sizeof(int); /* lkb_id */ | |
10169 | + len += sizeof(int); /* lkb_resource->res_reamasterid */ | |
10170 | + len += sizeof(int); /* lkb_flags */ | |
10171 | + len += sizeof(int); /* lkb_status */ | |
10172 | + len += sizeof(char); /* lkb_rqmode */ | |
10173 | + len += sizeof(char); /* lkb_grmode */ | |
10174 | + len += sizeof(int); /* lkb_childcnt */ | |
10175 | + len += sizeof(int); /* lkb_parent->lkb_id */ | |
10176 | + len += sizeof(int); /* lkb_bastaddr */ | |
10177 | + | |
10178 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { | |
10179 | + len += sizeof(int); /* number of lvb bytes */ | |
10180 | + len += DLM_LVB_LEN; | |
10181 | + } | |
10182 | + | |
10183 | + if (lkb->lkb_range) { | |
10184 | + len += sizeof(uint64_t); | |
10185 | + len += sizeof(uint64_t); | |
10186 | + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) { | |
10187 | + len += sizeof(uint64_t); | |
10188 | + len += sizeof(uint64_t); | |
10189 | + } | |
10190 | + } | |
10191 | + | |
10192 | + return len; | |
10193 | +} | |
10194 | + | |
10195 | +/* | |
10196 | + * It's up to the caller to be sure there's enough space in the buffer. | |
10197 | + */ | |
10198 | + | |
10199 | +static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp) | |
10200 | +{ | |
10201 | + int flags; | |
10202 | + | |
10203 | + /* Need to tell the remote end if we have a range */ | |
10204 | + flags = lkb->lkb_flags; | |
10205 | + if (lkb->lkb_range) | |
10206 | + flags |= GDLM_LKFLG_RANGE; | |
10207 | + | |
10208 | + /* | |
10209 | + * See lkb_length() | |
10210 | + * Total: 30 (no lvb) or 66 (with lvb) bytes | |
10211 | + */ | |
10212 | + | |
10213 | + put_int(lkb->lkb_id, buf, offp); | |
10214 | + put_int(lkb->lkb_resource->res_remasterid, buf, offp); | |
10215 | + put_int(flags, buf, offp); | |
10216 | + put_int(lkb->lkb_status, buf, offp); | |
10217 | + put_char(lkb->lkb_rqmode, buf, offp); | |
10218 | + put_char(lkb->lkb_grmode, buf, offp); | |
10219 | + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp); | |
10220 | + | |
10221 | + if (lkb->lkb_parent) | |
10222 | + put_int(lkb->lkb_parent->lkb_id, buf, offp); | |
10223 | + else | |
10224 | + put_int(0, buf, offp); | |
10225 | + | |
10226 | + if (lkb->lkb_bastaddr) | |
10227 | + put_int(1, buf, offp); | |
10228 | + else | |
10229 | + put_int(0, buf, offp); | |
10230 | + | |
10231 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { | |
10232 | + GDLM_ASSERT(lkb->lkb_lvbptr,); | |
10233 | + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp); | |
10234 | + } | |
10235 | + | |
10236 | + /* Only send the range we actually need */ | |
10237 | + if (lkb->lkb_range) { | |
10238 | + switch (lkb->lkb_status) { | |
10239 | + case GDLM_LKSTS_CONVERT: | |
10240 | + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp); | |
10241 | + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp); | |
10242 | + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp); | |
10243 | + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp); | |
10244 | + break; | |
10245 | + case GDLM_LKSTS_WAITING: | |
10246 | + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp); | |
10247 | + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp); | |
10248 | + break; | |
10249 | + case GDLM_LKSTS_GRANTED: | |
10250 | + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp); | |
10251 | + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp); | |
10252 | + break; | |
10253 | + default: | |
10254 | + GDLM_ASSERT(0,); | |
10255 | + } | |
10256 | + } | |
10257 | +} | |
10258 | + | |
10259 | +static int rsb_length(gd_res_t *rsb) | |
10260 | +{ | |
10261 | + int len = 0; | |
10262 | + | |
10263 | + len += sizeof(int); /* number of res_name bytes */ | |
10264 | + len += rsb->res_length; /* res_name */ | |
10265 | + len += sizeof(int); /* res_remasterid */ | |
10266 | + len += sizeof(int); /* res_parent->res_remasterid */ | |
10267 | + | |
10268 | + return len; | |
10269 | +} | |
10270 | + | |
10271 | +static inline gd_res_t *next_subrsb(gd_res_t *subrsb) | |
10272 | +{ | |
10273 | + struct list_head *tmp; | |
10274 | + gd_res_t *r; | |
10275 | + | |
10276 | + tmp = subrsb->res_subreslist.next; | |
10277 | + r = list_entry(tmp, gd_res_t, res_subreslist); | |
10278 | + | |
10279 | + return r; | |
10280 | +} | |
10281 | + | |
10282 | +static inline int last_in_list(gd_res_t *r, struct list_head *head) | |
10283 | +{ | |
10284 | + gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist); | |
10285 | + | |
10286 | + if (last == r) | |
10287 | + return 1; | |
10288 | + return 0; | |
10289 | +} | |
10290 | + | |
10291 | +/* | |
10292 | + * Used to decide if an rsb should be rebuilt on a new master. An rsb only | |
10293 | + * needs to be rebuild if we have lkb's queued on it. NOREBUILD lkb's on the | |
10294 | + * wait queue are not rebuilt. | |
10295 | + */ | |
10296 | + | |
10297 | +static int lkbs_to_remaster(gd_res_t *r) | |
10298 | +{ | |
10299 | + gd_lkb_t *lkb; | |
10300 | + gd_res_t *sub; | |
10301 | + | |
10302 | + if (!list_empty(&r->res_grantqueue) || | |
10303 | + !list_empty(&r->res_convertqueue)) | |
10304 | + return TRUE; | |
10305 | + | |
10306 | + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { | |
10307 | + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) | |
10308 | + continue; | |
10309 | + return TRUE; | |
10310 | + } | |
10311 | + | |
10312 | + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) { | |
10313 | + if (!list_empty(&sub->res_grantqueue) || | |
10314 | + !list_empty(&sub->res_convertqueue)) | |
10315 | + return TRUE; | |
10316 | + | |
10317 | + list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) { | |
10318 | + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) | |
10319 | + continue; | |
10320 | + return TRUE; | |
10321 | + } | |
10322 | + } | |
10323 | + | |
10324 | + return FALSE; | |
10325 | +} | |
10326 | + | |
10327 | +static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp) | |
10328 | +{ | |
10329 | + /* | |
10330 | + * See rsb_length() | |
10331 | + * Total: 36 bytes (4 + 24 + 4 + 4) | |
10332 | + */ | |
10333 | + | |
10334 | + put_bytes(rsb->res_name, rsb->res_length, buf, offp); | |
10335 | + put_int(rsb->res_remasterid, buf, offp); | |
10336 | + | |
10337 | + if (rsb->res_parent) | |
10338 | + put_int(rsb->res_parent->res_remasterid, buf, offp); | |
10339 | + else | |
10340 | + put_int(0, buf, offp); | |
10341 | + | |
10342 | + GDLM_ASSERT(!rsb->res_lvbptr,); | |
10343 | +} | |
10344 | + | |
10345 | +/* | |
10346 | + * Flatten an LKB into a buffer for sending to the new RSB master. As a | |
10347 | + * side-effect the nodeid of the lock is set to the nodeid of the new RSB | |
10348 | + * master. | |
10349 | + */ | |
10350 | + | |
10351 | +static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill) | |
10352 | +{ | |
10353 | + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen) | |
10354 | + goto nospace; | |
10355 | + | |
10356 | + lkb->lkb_nodeid = r->res_nodeid; | |
10357 | + | |
10358 | + put_char(REMASTER_LKB, fill->outbuf, &fill->offset); | |
10359 | + serialise_lkb(lkb, fill->outbuf, &fill->offset); | |
10360 | + | |
10361 | + fill->count++; | |
10362 | + need_new_lkid(r); | |
10363 | + return 0; | |
10364 | + | |
10365 | + nospace: | |
10366 | + return -ENOSPC; | |
10367 | +} | |
10368 | + | |
10369 | +/* | |
10370 | + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag. | |
10371 | + */ | |
10372 | + | |
10373 | +static int pack_lkb_queue(gd_res_t *r, struct list_head *queue, | |
10374 | + rcom_fill_t *fill) | |
10375 | +{ | |
10376 | + gd_lkb_t *lkb; | |
10377 | + int error; | |
10378 | + | |
10379 | + list_for_each_entry(lkb, queue, lkb_statequeue) { | |
10380 | + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) | |
10381 | + continue; | |
10382 | + | |
10383 | + error = pack_one_lkb(r, lkb, fill); | |
10384 | + if (error) | |
10385 | + goto nospace; | |
10386 | + } | |
10387 | + | |
10388 | + return 0; | |
10389 | + | |
10390 | + nospace: | |
10391 | + fill->lkb = lkb; | |
10392 | + fill->lkbqueue = queue; | |
10393 | + | |
10394 | + return error; | |
10395 | +} | |
10396 | + | |
10397 | +static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill) | |
10398 | +{ | |
10399 | + int error; | |
10400 | + | |
10401 | + error = pack_lkb_queue(r, &r->res_grantqueue, fill); | |
10402 | + if (error) | |
10403 | + goto nospace; | |
10404 | + | |
10405 | + error = pack_lkb_queue(r, &r->res_convertqueue, fill); | |
10406 | + if (error) | |
10407 | + goto nospace; | |
10408 | + | |
10409 | + error = pack_lkb_queue(r, &r->res_waitqueue, fill); | |
10410 | + | |
10411 | + nospace: | |
10412 | + return error; | |
10413 | +} | |
10414 | + | |
10415 | +/* | |
10416 | + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb | |
10417 | + * queue and full lkb queues. | |
10418 | + */ | |
10419 | + | |
10420 | +static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill) | |
10421 | +{ | |
10422 | + struct list_head *tmp, *start, *end; | |
10423 | + gd_lkb_t *lkb; | |
10424 | + int error; | |
10425 | + | |
10426 | + /* | |
10427 | + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue. | |
10428 | + */ | |
10429 | + | |
10430 | + error = pack_one_lkb(r, fill->lkb, fill); | |
10431 | + if (error) | |
10432 | + goto out; | |
10433 | + | |
10434 | + start = fill->lkb->lkb_statequeue.next; | |
10435 | + end = fill->lkbqueue; | |
10436 | + | |
10437 | + for (tmp = start; tmp != end; tmp = tmp->next) { | |
10438 | + lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue); | |
10439 | + | |
10440 | + error = pack_one_lkb(r, lkb, fill); | |
10441 | + if (error) { | |
10442 | + fill->lkb = lkb; | |
10443 | + goto out; | |
10444 | + } | |
10445 | + } | |
10446 | + | |
10447 | + /* | |
10448 | + * Pack all lkb's on r's queues following fill->lkbqueue. | |
10449 | + */ | |
10450 | + | |
10451 | + if (fill->lkbqueue == &r->res_waitqueue) | |
10452 | + goto out; | |
10453 | + if (fill->lkbqueue == &r->res_convertqueue) | |
10454 | + goto skip; | |
10455 | + | |
10456 | + GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,); | |
10457 | + | |
10458 | + error = pack_lkb_queue(r, &r->res_convertqueue, fill); | |
10459 | + if (error) | |
10460 | + goto out; | |
10461 | + skip: | |
10462 | + error = pack_lkb_queue(r, &r->res_waitqueue, fill); | |
10463 | + | |
10464 | + out: | |
10465 | + return error; | |
10466 | +} | |
10467 | + | |
10468 | +static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill) | |
10469 | +{ | |
10470 | + int error; | |
10471 | + | |
10472 | + down_write(&subrsb->res_lock); | |
10473 | + | |
10474 | + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen) | |
10475 | + goto nospace; | |
10476 | + | |
10477 | + subrsb->res_nodeid = rsb->res_nodeid; | |
10478 | + subrsb->res_remasterid = ++fill->remasterid; | |
10479 | + | |
10480 | + put_char(REMASTER_RSB, fill->outbuf, &fill->offset); | |
10481 | + serialise_rsb(subrsb, fill->outbuf, &fill->offset); | |
10482 | + | |
10483 | + error = pack_lkb_queues(subrsb, fill); | |
10484 | + if (error) | |
10485 | + goto nospace; | |
10486 | + | |
10487 | + up_write(&subrsb->res_lock); | |
10488 | + | |
10489 | + return 0; | |
10490 | + | |
10491 | + nospace: | |
10492 | + up_write(&subrsb->res_lock); | |
10493 | + fill->subrsb = subrsb; | |
10494 | + | |
10495 | + return -ENOSPC; | |
10496 | +} | |
10497 | + | |
10498 | +static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill) | |
10499 | +{ | |
10500 | + gd_res_t *subrsb; | |
10501 | + int error = 0; | |
10502 | + | |
10503 | + /* | |
10504 | + * When an initial subrsb is given, we know it needs to be packed. | |
10505 | + * When no initial subrsb is given, begin with the first (if any exist). | |
10506 | + */ | |
10507 | + | |
10508 | + if (!in_subrsb) { | |
10509 | + if (list_empty(&rsb->res_subreslist)) | |
10510 | + goto out; | |
10511 | + | |
10512 | + subrsb = list_entry(rsb->res_subreslist.next, gd_res_t, | |
10513 | + res_subreslist); | |
10514 | + } else | |
10515 | + subrsb = in_subrsb; | |
10516 | + | |
10517 | + for (;;) { | |
10518 | + error = pack_one_subrsb(rsb, subrsb, fill); | |
10519 | + if (error) | |
10520 | + goto out; | |
10521 | + | |
10522 | + if (last_in_list(subrsb, &rsb->res_subreslist)) | |
10523 | + break; | |
10524 | + | |
10525 | + subrsb = next_subrsb(subrsb); | |
10526 | + } | |
10527 | + | |
10528 | + out: | |
10529 | + return error; | |
10530 | +} | |
10531 | + | |
10532 | +/* | |
10533 | + * Finish packing whatever is left in an rsb tree. If space runs out while | |
10534 | + * finishing, save subrsb/lkb and this will be called again for the same rsb. | |
10535 | + * | |
10536 | + * !subrsb && lkb, we left off part way through root rsb's lkbs. | |
10537 | + * subrsb && !lkb, we left off just before starting a new subrsb. | |
10538 | + * subrsb && lkb, we left off part way through a subrsb's lkbs. | |
10539 | + * !subrsb && !lkb, we shouldn't be in this function, but starting | |
10540 | + * a new rsb in pack_rsb_tree(). | |
10541 | + */ | |
10542 | + | |
10543 | +static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb, | |
10544 | + rcom_fill_t *fill) | |
10545 | +{ | |
10546 | + gd_res_t *subrsb = NULL; | |
10547 | + int error = 0; | |
10548 | + | |
10549 | + if (!fill->subrsb && fill->lkb) { | |
10550 | + error = pack_lkb_remaining(rsb, fill); | |
10551 | + if (error) | |
10552 | + goto out; | |
10553 | + | |
10554 | + error = pack_subrsbs(rsb, NULL, fill); | |
10555 | + if (error) | |
10556 | + goto out; | |
10557 | + } | |
10558 | + | |
10559 | + else if (fill->subrsb && !fill->lkb) { | |
10560 | + error = pack_subrsbs(rsb, fill->subrsb, fill); | |
10561 | + if (error) | |
10562 | + goto out; | |
10563 | + } | |
10564 | + | |
10565 | + else if (fill->subrsb && fill->lkb) { | |
10566 | + error = pack_lkb_remaining(fill->subrsb, fill); | |
10567 | + if (error) | |
10568 | + goto out; | |
10569 | + | |
10570 | + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist)) | |
10571 | + goto out; | |
10572 | + | |
10573 | + subrsb = next_subrsb(fill->subrsb); | |
10574 | + | |
10575 | + error = pack_subrsbs(rsb, subrsb, fill); | |
10576 | + if (error) | |
10577 | + goto out; | |
10578 | + } | |
10579 | + | |
10580 | + fill->subrsb = NULL; | |
10581 | + fill->lkb = NULL; | |
10582 | + | |
10583 | + out: | |
10584 | + return error; | |
10585 | +} | |
10586 | + | |
10587 | +/* | |
10588 | + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a | |
10589 | + * buffer. When the buffer runs out of space, save the place to restart (the | |
10590 | + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit). | |
10591 | + */ | |
10592 | + | |
10593 | +static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill) | |
10594 | +{ | |
10595 | + int error = -ENOSPC; | |
10596 | + | |
10597 | + fill->remasterid = 0; | |
10598 | + | |
10599 | + /* | |
10600 | + * Pack the root rsb itself. A 1 byte type precedes the serialised | |
10601 | + * rsb. Then pack the lkb's for the root rsb. | |
10602 | + */ | |
10603 | + | |
10604 | + down_write(&rsb->res_lock); | |
10605 | + | |
10606 | + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen) | |
10607 | + goto out; | |
10608 | + | |
10609 | + rsb->res_remasterid = ++fill->remasterid; | |
10610 | + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset); | |
10611 | + serialise_rsb(rsb, fill->outbuf, &fill->offset); | |
10612 | + | |
10613 | + error = pack_lkb_queues(rsb, fill); | |
10614 | + if (error) | |
10615 | + goto out; | |
10616 | + | |
10617 | + up_write(&rsb->res_lock); | |
10618 | + | |
10619 | + /* | |
10620 | + * Pack subrsb/lkb's under the root rsb. | |
10621 | + */ | |
10622 | + | |
10623 | + error = pack_subrsbs(rsb, NULL, fill); | |
10624 | + | |
10625 | + return error; | |
10626 | + | |
10627 | + out: | |
10628 | + up_write(&rsb->res_lock); | |
10629 | + return error; | |
10630 | +} | |
10631 | + | |
10632 | +/* | |
10633 | + * Given an RSB, return the next RSB that should be sent to a new master. | |
10634 | + */ | |
10635 | + | |
10636 | +static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb) | |
10637 | +{ | |
10638 | + struct list_head *tmp, *start, *end; | |
10639 | + gd_res_t *r; | |
10640 | + | |
10641 | + if (!rsb) | |
10642 | + start = ls->ls_rootres.next; | |
10643 | + else | |
10644 | + start = rsb->res_rootlist.next; | |
10645 | + | |
10646 | + end = &ls->ls_rootres; | |
10647 | + | |
10648 | + for (tmp = start; tmp != end; tmp = tmp->next) { | |
10649 | + r = list_entry(tmp, gd_res_t, res_rootlist); | |
10650 | + | |
10651 | + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) { | |
10652 | + if (r->res_nodeid && lkbs_to_remaster(r)) { | |
10653 | + expect_new_lkids(r); | |
10654 | + return r; | |
10655 | + } else | |
10656 | + clear_bit(RESFL_NEW_MASTER, &r->res_flags); | |
10657 | + } | |
10658 | + } | |
10659 | + | |
10660 | + return NULL; | |
10661 | +} | |
10662 | + | |
10663 | +/* | |
10664 | + * Given an rcom buffer, fill it with RSB's that need to be sent to a single | |
10665 | + * new master node. In the case where all the data to send to one node | |
10666 | + * requires multiple messages, this function needs to resume filling each | |
10667 | + * successive buffer from the point where it left off when the previous buffer | |
10668 | + * filled up. | |
10669 | + */ | |
10670 | + | |
10671 | +static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid) | |
10672 | +{ | |
10673 | + gd_res_t *rsb, *prev_rsb = fill->rsb; | |
10674 | + int error; | |
10675 | + | |
10676 | + fill->offset = 0; | |
10677 | + | |
10678 | + if (!prev_rsb) { | |
10679 | + | |
10680 | + /* | |
10681 | + * The first time this function is called. | |
10682 | + */ | |
10683 | + | |
10684 | + rsb = next_remastered_rsb(ls, NULL); | |
10685 | + if (!rsb) | |
10686 | + goto no_more; | |
10687 | + | |
10688 | + } else if (fill->subrsb || fill->lkb) { | |
10689 | + | |
10690 | + /* | |
10691 | + * Continue packing an rsb tree that was partially packed last | |
10692 | + * time (fill->subrsb/lkb indicates where packing of last block | |
10693 | + * left off) | |
10694 | + */ | |
10695 | + | |
10696 | + rsb = prev_rsb; | |
10697 | + *nodeid = rsb->res_nodeid; | |
10698 | + | |
10699 | + error = pack_rsb_tree_remaining(ls, rsb, fill); | |
10700 | + if (error == -ENOSPC) | |
10701 | + goto more; | |
10702 | + | |
10703 | + rsb = next_remastered_rsb(ls, prev_rsb); | |
10704 | + if (!rsb) | |
10705 | + goto no_more; | |
10706 | + | |
10707 | + if (rsb->res_nodeid != prev_rsb->res_nodeid) | |
10708 | + goto more; | |
10709 | + } else { | |
10710 | + rsb = prev_rsb; | |
10711 | + } | |
10712 | + | |
10713 | + /* | |
10714 | + * Pack rsb trees into the buffer until we run out of space, run out of | |
10715 | + * new rsb's or hit a new nodeid. | |
10716 | + */ | |
10717 | + | |
10718 | + *nodeid = rsb->res_nodeid; | |
10719 | + | |
10720 | + for (;;) { | |
10721 | + error = pack_rsb_tree(ls, rsb, fill); | |
10722 | + if (error == -ENOSPC) | |
10723 | + goto more; | |
10724 | + | |
10725 | + prev_rsb = rsb; | |
10726 | + | |
10727 | + rsb = next_remastered_rsb(ls, prev_rsb); | |
10728 | + if (!rsb) | |
10729 | + goto no_more; | |
10730 | + | |
10731 | + if (rsb->res_nodeid != prev_rsb->res_nodeid) | |
10732 | + goto more; | |
10733 | + } | |
10734 | + | |
10735 | + more: | |
10736 | + fill->more = 1; | |
10737 | + fill->rsb = rsb; | |
10738 | + return; | |
10739 | + | |
10740 | + no_more: | |
10741 | + fill->more = 0; | |
10742 | +} | |
10743 | + | |
10744 | +/* | |
10745 | + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters. | |
10746 | + */ | |
10747 | + | |
10748 | +int rebuild_rsbs_send(gd_ls_t *ls) | |
10749 | +{ | |
10750 | + gd_rcom_t *rc; | |
10751 | + rcom_fill_t fill; | |
10752 | + uint32_t nodeid; | |
10753 | + int error; | |
10754 | + | |
10755 | + GDLM_ASSERT(recover_list_empty(ls),); | |
10756 | + | |
10757 | + log_all(ls, "rebuild locks"); | |
10758 | + | |
10759 | + error = -ENOMEM; | |
10760 | + rc = allocate_rcom_buffer(ls); | |
10761 | + if (!rc) | |
10762 | + goto ret; | |
10763 | + | |
10764 | + error = 0; | |
10765 | + memset(&fill, 0, sizeof(rcom_fill_t)); | |
10766 | + fill.outbuf = rc->rc_buf; | |
10767 | + fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t); | |
10768 | + | |
10769 | + do { | |
10770 | + fill_rcom_buffer(ls, &fill, &nodeid); | |
10771 | + if (!fill.offset) | |
10772 | + break; | |
10773 | + | |
10774 | + rc->rc_datalen = fill.offset; | |
10775 | + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0); | |
10776 | + if (error) | |
10777 | + goto out; | |
10778 | + | |
10779 | + schedule(); | |
10780 | + error = gdlm_recovery_stopped(ls); | |
10781 | + if (error) | |
10782 | + goto out; | |
10783 | + } | |
10784 | + while (fill.more); | |
10785 | + | |
10786 | + error = gdlm_wait_function(ls, &recover_list_empty); | |
10787 | + | |
10788 | + log_all(ls, "rebuilt %d locks", fill.count); | |
10789 | + | |
10790 | + out: | |
10791 | + rebuild_freemem(ls); | |
10792 | + free_rcom_buffer(rc); | |
10793 | + | |
10794 | + ret: | |
10795 | + return error; | |
10796 | +} | |
10797 | + | |
10798 | +static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid, | |
10799 | + gd_res_t *rootrsb) | |
10800 | +{ | |
10801 | + gd_res_t *rsb; | |
10802 | + | |
10803 | + GDLM_ASSERT(rootrsb,); | |
10804 | + | |
10805 | + if (rootrsb->res_remasterid == remasterid) { | |
10806 | + rsb = rootrsb; | |
10807 | + goto out; | |
10808 | + } | |
10809 | + | |
10810 | + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) { | |
10811 | + if (rsb->res_remasterid == remasterid) | |
10812 | + goto out; | |
10813 | + } | |
10814 | + rsb = NULL; | |
10815 | + | |
10816 | + out: | |
10817 | + return rsb; | |
10818 | +} | |
10819 | + | |
10820 | +/* | |
10821 | + * Search a queue for the given remote lock id (remlkid). | |
10822 | + */ | |
10823 | + | |
10824 | +static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid, | |
10825 | + int remid) | |
10826 | +{ | |
10827 | + gd_lkb_t *lkb; | |
10828 | + | |
10829 | + list_for_each_entry(lkb, statequeue, lkb_statequeue) { | |
10830 | + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) { | |
10831 | + return lkb; | |
10832 | + } | |
10833 | + } | |
10834 | + | |
10835 | + return NULL; | |
10836 | +} | |
10837 | + | |
10838 | +/* | |
10839 | + * Given a remote lock ID (and a parent resource), return the local LKB for it | |
10840 | + * Hopefully we dont need to do this too often on deep lock trees. This is | |
10841 | + * VERY suboptimal for anything but the smallest lock trees. It searches the | |
10842 | + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and | |
10843 | + * returns the LKB address. OPTIMISATION: we should keep a list of these while | |
10844 | + * we are building up the remastered LKBs | |
10845 | + */ | |
10846 | + | |
10847 | +static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid) | |
10848 | +{ | |
10849 | + gd_lkb_t *lkb; | |
10850 | + gd_res_t *rsb; | |
10851 | + | |
10852 | + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid); | |
10853 | + if (lkb) | |
10854 | + goto out; | |
10855 | + | |
10856 | + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid); | |
10857 | + if (lkb) | |
10858 | + goto out; | |
10859 | + | |
10860 | + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid); | |
10861 | + if (lkb) | |
10862 | + goto out; | |
10863 | + | |
10864 | + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) { | |
10865 | + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid); | |
10866 | + if (lkb) | |
10867 | + goto out; | |
10868 | + | |
10869 | + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid); | |
10870 | + if (lkb) | |
10871 | + goto out; | |
10872 | + | |
10873 | + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid); | |
10874 | + if (lkb) | |
10875 | + goto out; | |
10876 | + } | |
10877 | + lkb = NULL; | |
10878 | + | |
10879 | + out: | |
10880 | + return lkb; | |
10881 | +} | |
10882 | + | |
10883 | +/* | |
10884 | + * Unpack an LKB from a remaster operation | |
10885 | + */ | |
10886 | + | |
10887 | +static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb, | |
10888 | + char *buf, int *ptr, char *outbuf, int *outoffp) | |
10889 | +{ | |
10890 | + gd_lkb_t *lkb; | |
10891 | + gd_res_t *rsb; | |
10892 | + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp; | |
10893 | + | |
10894 | + remote_lkid = get_int(buf, ptr); | |
10895 | + | |
10896 | + rsb_rmid = get_int(buf, ptr); | |
10897 | + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb); | |
10898 | + GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid);); | |
10899 | + | |
10900 | + /* | |
10901 | + * We could have received this lkb already from a previous recovery | |
10902 | + * that was interrupted. If so, just return the lkid to the remote | |
10903 | + * node. | |
10904 | + */ | |
10905 | + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid); | |
10906 | + if (lkb) | |
10907 | + goto put_lkid; | |
10908 | + | |
10909 | + lkb = create_lkb(rsb->res_ls); | |
10910 | + if (!lkb) | |
10911 | + goto out; | |
10912 | + | |
10913 | + lkb->lkb_remid = remote_lkid; | |
10914 | + lkb->lkb_flags = get_int(buf, ptr); | |
10915 | + status = get_int(buf, ptr); | |
10916 | + lkb->lkb_rqmode = get_char(buf, ptr); | |
10917 | + lkb->lkb_grmode = get_char(buf, ptr); | |
10918 | + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr)); | |
10919 | + | |
10920 | + parentid = get_int(buf, ptr); | |
10921 | + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr); | |
10922 | + | |
10923 | + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { | |
10924 | + lkb->lkb_lvbptr = allocate_lvb(ls); | |
10925 | + if (!lkb->lkb_lvbptr) | |
10926 | + goto out; | |
10927 | + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr); | |
10928 | + } | |
10929 | + | |
10930 | + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) { | |
10931 | + uint64_t start, end; | |
10932 | + | |
10933 | + /* Don't need to keep the range flag, for comms use only */ | |
10934 | + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE; | |
10935 | + start = get_int64(buf, ptr); | |
10936 | + end = get_int64(buf, ptr); | |
10937 | + | |
10938 | + lkb->lkb_range = allocate_range(rsb->res_ls); | |
10939 | + if (!lkb->lkb_range) | |
10940 | + goto out; | |
10941 | + | |
10942 | + switch (status) { | |
10943 | + case GDLM_LKSTS_CONVERT: | |
10944 | + lkb->lkb_range[RQ_RANGE_START] = start; | |
10945 | + lkb->lkb_range[RQ_RANGE_END] = end; | |
10946 | + start = get_int64(buf, ptr); | |
10947 | + end = get_int64(buf, ptr); | |
10948 | + lkb->lkb_range[GR_RANGE_START] = start; | |
10949 | + lkb->lkb_range[GR_RANGE_END] = end; | |
10950 | + | |
10951 | + case GDLM_LKSTS_WAITING: | |
10952 | + lkb->lkb_range[RQ_RANGE_START] = start; | |
10953 | + lkb->lkb_range[RQ_RANGE_END] = end; | |
10954 | + break; | |
10955 | + | |
10956 | + case GDLM_LKSTS_GRANTED: | |
10957 | + lkb->lkb_range[GR_RANGE_START] = start; | |
10958 | + lkb->lkb_range[GR_RANGE_END] = end; | |
10959 | + break; | |
10960 | + default: | |
10961 | + GDLM_ASSERT(0,); | |
10962 | + } | |
10963 | + } | |
10964 | + | |
10965 | + /* Resolve local lock LKB address from parent ID */ | |
10966 | + if (parentid) | |
10967 | + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid, | |
10968 | + parentid); | |
10969 | + | |
10970 | + atomic_inc(&rsb->res_ref); | |
10971 | + lkb->lkb_resource = rsb; | |
10972 | + | |
10973 | + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY; | |
10974 | + lkb->lkb_nodeid = rem_nodeid; | |
10975 | + | |
10976 | + /* | |
10977 | + * Put the lkb on an RSB queue. An lkb that's in the midst of a | |
10978 | + * conversion request (on the requesting node's lockqueue and has | |
10979 | + * LQCONVERT set) should be put on the granted queue. The convert | |
10980 | + * request will be resent by the requesting node. | |
10981 | + */ | |
10982 | + | |
10983 | + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) { | |
10984 | + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT; | |
10985 | + GDLM_ASSERT(status == GDLM_LKSTS_CONVERT, | |
10986 | + printk("status=%d\n", status);); | |
10987 | + lkb->lkb_rqmode = DLM_LOCK_IV; | |
10988 | + status = GDLM_LKSTS_GRANTED; | |
10989 | + } | |
10990 | + | |
10991 | + lkb_enqueue(rsb, lkb, status); | |
10992 | + | |
10993 | + /* | |
10994 | + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL). | |
10995 | + */ | |
10996 | + | |
10997 | + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK) | |
10998 | + && lkb->lkb_grmode > DLM_LOCK_NL) { | |
10999 | + if (!rsb->res_lvbptr) | |
11000 | + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); | |
11001 | + if (!rsb->res_lvbptr) | |
11002 | + goto out; | |
11003 | + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); | |
11004 | + } | |
11005 | + | |
11006 | + /* | |
11007 | + * Clear flags that may have been sent over that are only relevant in | |
11008 | + * the context of the sender. | |
11009 | + */ | |
11010 | + | |
5cdbd17b AM |
11011 | + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND | |
11012 | + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED); | |
4bf12011 | 11013 | + |
11014 | + put_lkid: | |
11015 | + /* Return the new LKID to the caller's buffer */ | |
11016 | + put_int(lkb->lkb_id, outbuf, outoffp); | |
11017 | + put_int(lkb->lkb_remid, outbuf, outoffp); | |
11018 | + error = 0; | |
11019 | + | |
11020 | + out: | |
11021 | + return error; | |
11022 | +} | |
11023 | + | |
11024 | +static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb, | |
11025 | + char *buf, int *ptr) | |
11026 | +{ | |
11027 | + int length; | |
11028 | + int remasterid; | |
11029 | + int parent_remasterid; | |
11030 | + char name[DLM_RESNAME_MAXLEN]; | |
11031 | + int error; | |
11032 | + gd_res_t *parent = NULL; | |
11033 | + gd_res_t *rsb; | |
11034 | + | |
11035 | + get_bytes(name, &length, buf, ptr); | |
11036 | + remasterid = get_int(buf, ptr); | |
11037 | + parent_remasterid = get_int(buf, ptr); | |
11038 | + | |
11039 | + if (parent_remasterid) | |
11040 | + parent = find_by_remasterid(ls, parent_remasterid, rootrsb); | |
11041 | + | |
11042 | + /* | |
11043 | + * The rsb reference from this find_or_create_rsb() will keep the rsb | |
11044 | + * around while we add new lkb's to it from deserialise_lkb. Each of | |
11045 | + * the lkb's will add an rsb reference. The reference added here is | |
11046 | + * removed by release_rsb() after all lkb's are added. | |
11047 | + */ | |
11048 | + | |
11049 | + error = find_or_create_rsb(ls, parent, name, length, 1, &rsb); | |
11050 | + GDLM_ASSERT(!error,); | |
11051 | + | |
11052 | + /* There is a case where the above needs to create the RSB. */ | |
11053 | + if (rsb->res_nodeid == -1) | |
11054 | + rsb->res_nodeid = our_nodeid(); | |
11055 | + | |
11056 | + rsb->res_remasterid = remasterid; | |
11057 | + | |
11058 | + return rsb; | |
11059 | +} | |
11060 | + | |
11061 | +/* | |
11062 | + * Processing at the receiving end of a NEWLOCKS message from a node in | |
11063 | + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote | |
11064 | + * node whose locks we are now mastering. For a reply we need to send back the | |
11065 | + * new lockids of the remastered locks so that remote ops can find them. | |
11066 | + */ | |
11067 | + | |
11068 | +int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len) | |
11069 | +{ | |
11070 | + gd_rcom_t *rc; | |
11071 | + gd_res_t *rsb = NULL; | |
11072 | + rebuild_node_t *rnode; | |
11073 | + char *outbuf; | |
11074 | + int outptr, ptr = 0, error = -ENOMEM; | |
11075 | + | |
11076 | + rnode = find_rebuild_root(ls, nodeid); | |
11077 | + if (!rnode) | |
11078 | + goto out; | |
11079 | + | |
11080 | + /* | |
11081 | + * Allocate a buffer for the reply message which is a list of remote | |
11082 | + * lock IDs and their (new) local lock ids. It will always be big | |
11083 | + * enough to fit <n> ID pairs if it already fit <n> LKBs. | |
11084 | + */ | |
11085 | + | |
11086 | + rc = allocate_rcom_buffer(ls); | |
11087 | + if (!rc) | |
11088 | + goto out; | |
11089 | + outbuf = rc->rc_buf; | |
11090 | + outptr = 0; | |
11091 | + | |
11092 | + /* | |
11093 | + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're | |
11094 | + * created. Each deserialise_rsb adds an rsb reference that must be | |
11095 | + * removed with release_rsb once all new lkb's for an rsb have been | |
11096 | + * added. | |
11097 | + */ | |
11098 | + | |
11099 | + while (ptr < len) { | |
11100 | + int type; | |
11101 | + | |
11102 | + type = get_char(buf, &ptr); | |
11103 | + | |
11104 | + switch (type) { | |
11105 | + case REMASTER_ROOTRSB: | |
11106 | + if (rsb) | |
11107 | + release_rsb(rsb); | |
11108 | + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf, | |
11109 | + &ptr); | |
11110 | + rnode->rootrsb = rsb; | |
11111 | + break; | |
11112 | + | |
11113 | + case REMASTER_RSB: | |
11114 | + if (rsb) | |
11115 | + release_rsb(rsb); | |
11116 | + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf, | |
11117 | + &ptr); | |
11118 | + break; | |
11119 | + | |
11120 | + case REMASTER_LKB: | |
11121 | + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr, | |
11122 | + outbuf, &outptr); | |
11123 | + break; | |
11124 | + | |
11125 | + default: | |
11126 | + GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d " | |
11127 | + "len=%d\n", type, nodeid, ptr, | |
11128 | + len);); | |
11129 | + } | |
11130 | + } | |
11131 | + | |
11132 | + if (rsb) | |
11133 | + release_rsb(rsb); | |
11134 | + | |
11135 | + /* | |
11136 | + * Reply with the new lock IDs. | |
11137 | + */ | |
11138 | + | |
11139 | + rc->rc_datalen = outptr; | |
11140 | + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0); | |
11141 | + | |
11142 | + free_rcom_buffer(rc); | |
11143 | + | |
11144 | + out: | |
11145 | + return error; | |
11146 | +} | |
11147 | + | |
11148 | +/* | |
11149 | + * Processing for a NEWLOCKIDS message. Called when we get the reply from the | |
11150 | + * new master telling us what the new remote lock IDs are for the remastered | |
11151 | + * locks | |
11152 | + */ | |
11153 | + | |
11154 | +int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len) | |
11155 | +{ | |
11156 | + int offset = 0; | |
11157 | + | |
11158 | + if (len == 1) | |
11159 | + len = 0; | |
11160 | + | |
11161 | + while (offset < len) { | |
11162 | + int remote_id; | |
11163 | + int local_id; | |
11164 | + gd_lkb_t *lkb; | |
11165 | + | |
11166 | + if (offset + 8 > len) { | |
11167 | + log_error(ls, "rebuild_rsbs_lkids_recv: bad data " | |
11168 | + "length nodeid=%d offset=%d len=%d", | |
11169 | + nodeid, offset, len); | |
11170 | + break; | |
11171 | + } | |
11172 | + | |
11173 | + remote_id = get_int(buf, &offset); | |
11174 | + local_id = get_int(buf, &offset); | |
11175 | + | |
11176 | + lkb = find_lock_by_id(ls, local_id); | |
11177 | + if (lkb) { | |
11178 | + lkb->lkb_remid = remote_id; | |
11179 | + have_new_lkid(lkb); | |
11180 | + } else { | |
11181 | + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid " | |
11182 | + "nodeid=%d id=%x remid=%x offset=%d len=%d", | |
11183 | + nodeid, local_id, remote_id, offset, len); | |
11184 | + } | |
11185 | + } | |
11186 | + | |
11187 | + if (recover_list_empty(ls)) | |
11188 | + wake_up(&ls->ls_wait_general); | |
11189 | + | |
11190 | + return 0; | |
11191 | +} | |
11192 | diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h | |
11193 | --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 11194 | +++ linux-patched/cluster/dlm/rebuild.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 11195 | @@ -0,0 +1,22 @@ |
11196 | +/****************************************************************************** | |
11197 | +******************************************************************************* | |
11198 | +** | |
11199 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
11200 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
11201 | +** | |
11202 | +** This copyrighted material is made available to anyone wishing to use, | |
11203 | +** modify, copy, or redistribute it subject to the terms and conditions | |
11204 | +** of the GNU General Public License v.2. | |
11205 | +** | |
11206 | +******************************************************************************* | |
11207 | +******************************************************************************/ | |
11208 | + | |
11209 | +#ifndef __REBUILD_DOT_H__ | |
11210 | +#define __REBUILD_DOT_H__ | |
11211 | + | |
11212 | +int rebuild_rsbs_send(gd_ls_t * ls); | |
11213 | +int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len); | |
11214 | +int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len); | |
11215 | +int rebuild_freemem(gd_ls_t * ls); | |
11216 | + | |
11217 | +#endif /* __REBUILD_DOT_H__ */ | |
11218 | diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c | |
11219 | --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 11220 | +++ linux-patched/cluster/dlm/reccomms.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 11221 | @@ -0,0 +1,502 @@ |
11222 | +/****************************************************************************** | |
11223 | +******************************************************************************* | |
11224 | +** | |
11225 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
11226 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
11227 | +** | |
11228 | +** This copyrighted material is made available to anyone wishing to use, | |
11229 | +** modify, copy, or redistribute it subject to the terms and conditions | |
11230 | +** of the GNU General Public License v.2. | |
11231 | +** | |
11232 | +******************************************************************************* | |
11233 | +******************************************************************************/ | |
11234 | + | |
11235 | +#include "dlm_internal.h" | |
11236 | +#include "lowcomms.h" | |
11237 | +#include "midcomms.h" | |
11238 | +#include "reccomms.h" | |
11239 | +#include "nodes.h" | |
11240 | +#include "lockspace.h" | |
11241 | +#include "recover.h" | |
11242 | +#include "dir.h" | |
11243 | +#include "config.h" | |
11244 | +#include "rebuild.h" | |
11245 | +#include "memory.h" | |
11246 | + | |
11247 | +/* Running on the basis that only a single recovery communication will be done | |
11248 | + * at a time per lockspace */ | |
11249 | + | |
11250 | +static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc); | |
11251 | + | |
11252 | +/* | |
11253 | + * Track per-node progress/stats during recovery to help debugging. | |
11254 | + */ | |
11255 | + | |
11256 | +void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send) | |
11257 | +{ | |
11258 | + gd_csb_t *csb; | |
11259 | + int found = 0; | |
11260 | + | |
11261 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
11262 | + if (csb->csb_node->gn_nodeid == nodeid) { | |
11263 | + found = TRUE; | |
11264 | + break; | |
11265 | + } | |
11266 | + } | |
11267 | + | |
11268 | + if (!found) | |
11269 | + return; | |
11270 | + | |
11271 | + if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) { | |
11272 | + if (send) { | |
11273 | + csb->csb_names_send_count++; | |
11274 | + csb->csb_names_send_msgid = rc->rc_msgid; | |
11275 | + } else { | |
11276 | + csb->csb_names_recv_count++; | |
11277 | + csb->csb_names_recv_msgid = rc->rc_msgid; | |
11278 | + } | |
11279 | + } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) { | |
11280 | + if (send) { | |
11281 | + csb->csb_locks_send_count++; | |
11282 | + csb->csb_locks_send_msgid = rc->rc_msgid; | |
11283 | + } else { | |
11284 | + csb->csb_locks_recv_count++; | |
11285 | + csb->csb_locks_recv_msgid = rc->rc_msgid; | |
11286 | + } | |
11287 | + } | |
11288 | +} | |
11289 | + | |
11290 | +void rcom_log_clear(gd_ls_t *ls) | |
11291 | +{ | |
11292 | + gd_csb_t *csb; | |
11293 | + | |
11294 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
11295 | + csb->csb_names_send_count = 0; | |
11296 | + csb->csb_names_send_msgid = 0; | |
11297 | + csb->csb_names_recv_count = 0; | |
11298 | + csb->csb_names_recv_msgid = 0; | |
11299 | + csb->csb_locks_send_count = 0; | |
11300 | + csb->csb_locks_send_msgid = 0; | |
11301 | + csb->csb_locks_recv_count = 0; | |
11302 | + csb->csb_locks_recv_msgid = 0; | |
11303 | + } | |
11304 | +} | |
11305 | + | |
11306 | +static int rcom_response(gd_ls_t *ls) | |
11307 | +{ | |
11308 | + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags); | |
11309 | +} | |
11310 | + | |
11311 | +/** | |
11312 | + * rcom_send_message - send or request recovery data | |
11313 | + * @ls: the lockspace | |
11314 | + * @nodeid: node to which the message is sent | |
11315 | + * @type: type of recovery message | |
11316 | + * @rc: the rc buffer to send | |
11317 | + * @need_reply: wait for reply if this is set | |
11318 | + * | |
11319 | + * Using this interface | |
11320 | + * i) Allocate an rc buffer: | |
11321 | + * rc = allocate_rcom_buffer(ls); | |
11322 | + * ii) Copy data to send beginning at rc->rc_buf: | |
11323 | + * memcpy(rc->rc_buf, mybuf, mylen); | |
11324 | + * iii) Set rc->rc_datalen to the number of bytes copied in (ii): | |
11325 | + * rc->rc_datalen = mylen | |
11326 | + * iv) Submit the rc to this function: | |
11327 | + * rcom_send_message(rc); | |
11328 | + * | |
11329 | + * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t). If | |
11330 | + * more data must be passed in one send, use rcom_expand_buffer() which | |
11331 | + * incrementally increases the size of the rc buffer by dlm_config.buffer_size | |
11332 | + * bytes. | |
11333 | + * | |
11334 | + * Any data returned for the message (when need_reply is set) will saved in | |
11335 | + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the | |
11336 | + * number of bytes copied into rc->rc_buf. | |
11337 | + * | |
11338 | + * Returns: 0 on success, -EXXX on failure | |
11339 | + */ | |
11340 | + | |
11341 | +int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc, | |
11342 | + int need_reply) | |
11343 | +{ | |
11344 | + int error = 0; | |
11345 | + | |
11346 | + if (!rc->rc_datalen) | |
11347 | + rc->rc_datalen = 1; | |
11348 | + | |
11349 | + /* | |
11350 | + * Fill in the header. | |
11351 | + */ | |
11352 | + | |
11353 | + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE; | |
11354 | + rc->rc_header.rh_lockspace = ls->ls_global_id; | |
11355 | + rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1; | |
11356 | + rc->rc_subcmd = type; | |
11357 | + rc->rc_msgid = ++ls->ls_rcom_msgid; | |
11358 | + | |
11359 | + rcom_log(ls, nodeid, rc, 1); | |
11360 | + | |
11361 | + /* | |
11362 | + * When a reply is received, the reply data goes back into this buffer. | |
11363 | + * Synchronous rcom requests (need_reply=1) are serialised because of | |
11364 | + * the single ls_rcom. | |
11365 | + */ | |
11366 | + | |
11367 | + if (need_reply) { | |
11368 | + down(&ls->ls_rcom_lock); | |
11369 | + ls->ls_rcom = rc; | |
11370 | + } | |
11371 | + | |
11372 | + /* | |
11373 | + * After sending the message we'll wait at the end of this function to | |
11374 | + * get a reply. The READY flag will be set when the reply has been | |
11375 | + * received and requested data has been copied into | |
11376 | + * ls->ls_rcom->rc_buf; | |
11377 | + */ | |
11378 | + | |
11379 | + GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),); | |
11380 | + | |
11381 | + /* | |
11382 | + * The WAIT bit indicates that we're waiting for and willing to accept a | |
11383 | + * reply. Any replies are ignored unless this bit is set. | |
11384 | + */ | |
11385 | + | |
11386 | + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags); | |
11387 | + | |
11388 | + /* | |
11389 | + * Process the message locally. | |
11390 | + */ | |
11391 | + | |
11392 | + if (nodeid == our_nodeid()) { | |
11393 | + rcom_process_message(ls, nodeid, rc); | |
11394 | + goto out; | |
11395 | + } | |
11396 | + | |
11397 | + /* | |
11398 | + * Send the message. | |
11399 | + */ | |
11400 | + | |
11401 | + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid); | |
11402 | + | |
11403 | + error = midcomms_send_message(nodeid, (struct gd_req_header *) rc, | |
11404 | + GFP_KERNEL); | |
11405 | + GDLM_ASSERT(error >= 0, printk("error = %d\n", error);); | |
11406 | + error = 0; | |
11407 | + | |
11408 | + /* | |
11409 | + * Wait for a reply. Once a reply is processed from midcomms, the | |
11410 | + * READY bit will be set and we'll be awoken (gdlm_wait_function will | |
11411 | + * return 0). | |
11412 | + */ | |
11413 | + | |
11414 | + if (need_reply) { | |
11415 | + error = gdlm_wait_function(ls, &rcom_response); | |
11416 | + if (error) | |
11417 | + log_debug(ls, "rcom wait error %d", error); | |
11418 | + } | |
11419 | + | |
11420 | + out: | |
11421 | + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags); | |
11422 | + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags); | |
11423 | + | |
11424 | + if (need_reply) | |
11425 | + up(&ls->ls_rcom_lock); | |
11426 | + | |
11427 | + return error; | |
11428 | +} | |
11429 | + | |
11430 | +/* | |
11431 | + * Runs in same context as midcomms. | |
11432 | + */ | |
11433 | + | |
11434 | +static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc) | |
11435 | +{ | |
11436 | + gd_rcom_t rc_stack; | |
11437 | + gd_rcom_t *reply = NULL; | |
11438 | + gd_resdata_t *rd; | |
11439 | + int status, datalen, maxlen; | |
11440 | + uint32_t be_nodeid; | |
11441 | + | |
11442 | + if (!ls) | |
11443 | + return; | |
11444 | + | |
11445 | + rcom_log(ls, nodeid, rc, 0); | |
11446 | + | |
11447 | + if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) { | |
11448 | + log_error(ls, "ignoring recovery message %x from %u", | |
11449 | + rc->rc_subcmd, nodeid); | |
11450 | + return; | |
11451 | + } | |
11452 | + | |
11453 | + switch (rc->rc_subcmd) { | |
11454 | + | |
11455 | + case RECCOMM_STATUS: | |
11456 | + | |
11457 | + memset(&rc_stack, 0, sizeof(gd_rcom_t)); | |
11458 | + reply = &rc_stack; | |
11459 | + | |
11460 | + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; | |
11461 | + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; | |
11462 | + reply->rc_subcmd = rc->rc_subcmd; | |
11463 | + reply->rc_msgid = rc->rc_msgid; | |
11464 | + reply->rc_buf[0] = 0; | |
11465 | + | |
11466 | + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags)) | |
11467 | + reply->rc_buf[0] |= RESDIR_VALID; | |
11468 | + | |
11469 | + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags)) | |
11470 | + reply->rc_buf[0] |= RESDIR_ALL_VALID; | |
11471 | + | |
11472 | + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags)) | |
11473 | + reply->rc_buf[0] |= NODES_VALID; | |
11474 | + | |
11475 | + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags)) | |
11476 | + reply->rc_buf[0] |= NODES_ALL_VALID; | |
11477 | + | |
11478 | + reply->rc_datalen = 1; | |
11479 | + reply->rc_header.rh_length = | |
11480 | + sizeof(gd_rcom_t) + reply->rc_datalen - 1; | |
11481 | + | |
11482 | + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid); | |
11483 | + break; | |
11484 | + | |
11485 | + case RECCOMM_RECOVERNAMES: | |
11486 | + | |
11487 | + reply = allocate_rcom_buffer(ls); | |
11488 | + GDLM_ASSERT(reply,); | |
11489 | + maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t); | |
11490 | + | |
11491 | + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; | |
11492 | + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; | |
11493 | + reply->rc_subcmd = rc->rc_subcmd; | |
11494 | + reply->rc_msgid = rc->rc_msgid; | |
11495 | + | |
11496 | + /* | |
11497 | + * The other node wants a bunch of resource names. The name of | |
11498 | + * the resource to begin with is in rc->rc_buf. | |
11499 | + */ | |
11500 | + | |
11501 | + datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen, | |
11502 | + reply->rc_buf, maxlen, nodeid); | |
11503 | + | |
11504 | + reply->rc_datalen = datalen; | |
11505 | + reply->rc_header.rh_length = | |
11506 | + sizeof(gd_rcom_t) + reply->rc_datalen - 1; | |
11507 | + | |
11508 | + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid, | |
11509 | + reply->rc_msgid); | |
11510 | + break; | |
11511 | + | |
11512 | + case RECCOMM_GETMASTER: | |
11513 | + | |
11514 | + reply = allocate_rcom_buffer(ls); | |
11515 | + GDLM_ASSERT(reply,); | |
11516 | + | |
11517 | + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; | |
11518 | + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; | |
11519 | + reply->rc_subcmd = rc->rc_subcmd; | |
11520 | + reply->rc_msgid = rc->rc_msgid; | |
11521 | + | |
11522 | + /* | |
11523 | + * The other node wants to know the master of a named resource. | |
11524 | + */ | |
11525 | + | |
11526 | + status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, | |
11527 | + &rd, 1); | |
11528 | + if (status != 0) { | |
11529 | + free_rcom_buffer(reply); | |
11530 | + reply = NULL; | |
11531 | + return; | |
11532 | + } | |
11533 | + be_nodeid = cpu_to_be32(rd->rd_master_nodeid); | |
11534 | + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t)); | |
11535 | + reply->rc_datalen = sizeof(uint32_t); | |
11536 | + reply->rc_header.rh_length = | |
11537 | + sizeof(gd_rcom_t) + reply->rc_datalen - 1; | |
11538 | + break; | |
11539 | + | |
11540 | + case RECCOMM_BULKLOOKUP: | |
11541 | + | |
11542 | + reply = allocate_rcom_buffer(ls); | |
11543 | + GDLM_ASSERT(reply,); | |
11544 | + | |
11545 | + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; | |
11546 | + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; | |
11547 | + reply->rc_subcmd = rc->rc_subcmd; | |
11548 | + reply->rc_msgid = rc->rc_msgid; | |
11549 | + | |
11550 | + /* | |
11551 | + * This is a bulk version of the above and just returns a | |
11552 | + * buffer full of node ids to match the resources | |
11553 | + */ | |
11554 | + | |
11555 | + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf, | |
11556 | + rc->rc_datalen, reply->rc_buf); | |
11557 | + if (datalen < 0) { | |
11558 | + free_rcom_buffer(reply); | |
11559 | + reply = NULL; | |
11560 | + return; | |
11561 | + } | |
11562 | + | |
11563 | + reply->rc_datalen = datalen; | |
11564 | + reply->rc_header.rh_length = | |
11565 | + sizeof(gd_rcom_t) + reply->rc_datalen - 1; | |
11566 | + break; | |
11567 | + | |
11568 | + /* | |
11569 | + * These RECCOMM messages don't need replies. | |
11570 | + */ | |
11571 | + | |
11572 | + case RECCOMM_NEWLOCKS: | |
11573 | + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen); | |
11574 | + break; | |
11575 | + | |
11576 | + case RECCOMM_NEWLOCKIDS: | |
11577 | + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen); | |
11578 | + break; | |
11579 | + | |
11580 | + case RECCOMM_REMRESDATA: | |
11581 | + remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1); | |
11582 | + break; | |
11583 | + | |
11584 | + default: | |
11585 | + GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd);); | |
11586 | + } | |
11587 | + | |
11588 | + if (reply) { | |
11589 | + if (nodeid == our_nodeid()) { | |
11590 | + GDLM_ASSERT(rc == ls->ls_rcom,); | |
11591 | + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen); | |
11592 | + rc->rc_datalen = reply->rc_datalen; | |
11593 | + } else { | |
11594 | + midcomms_send_message(nodeid, | |
11595 | + (struct gd_req_header *) reply, | |
11596 | + GFP_KERNEL); | |
11597 | + } | |
11598 | + | |
11599 | + if (reply != &rc_stack) | |
11600 | + free_rcom_buffer(reply); | |
11601 | + } | |
11602 | +} | |
11603 | + | |
11604 | +static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply) | |
11605 | +{ | |
11606 | + gd_rcom_t *rc = ls->ls_rcom; | |
11607 | + | |
11608 | + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) { | |
11609 | + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid); | |
11610 | + return; | |
11611 | + } | |
11612 | + | |
11613 | + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) { | |
11614 | + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u", | |
11615 | + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid); | |
11616 | + return; | |
11617 | + } | |
11618 | + | |
11619 | + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen); | |
11620 | + rc->rc_datalen = reply->rc_datalen; | |
11621 | + | |
11622 | + /* | |
11623 | + * Tell the thread waiting in rcom_send_message() that it can go ahead. | |
11624 | + */ | |
11625 | + | |
11626 | + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags); | |
11627 | + wake_up(&ls->ls_wait_general); | |
11628 | +} | |
11629 | + | |
11630 | +static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply) | |
11631 | +{ | |
11632 | + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen, | |
11633 | + reply->rc_msgid); | |
11634 | +} | |
11635 | + | |
11636 | +/* | |
11637 | + * Runs in same context as midcomms. | |
11638 | + */ | |
11639 | + | |
11640 | +static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply) | |
11641 | +{ | |
11642 | + if (gdlm_recovery_stopped(ls)) { | |
11643 | + log_error(ls, "ignoring recovery reply %x from %u", | |
11644 | + reply->rc_subcmd, nodeid); | |
11645 | + return; | |
11646 | + } | |
11647 | + | |
11648 | + switch (reply->rc_subcmd) { | |
11649 | + case RECCOMM_GETMASTER: | |
11650 | + process_reply_async(ls, nodeid, reply); | |
11651 | + break; | |
11652 | + case RECCOMM_STATUS: | |
11653 | + case RECCOMM_NEWLOCKS: | |
11654 | + case RECCOMM_NEWLOCKIDS: | |
11655 | + case RECCOMM_RECOVERNAMES: | |
11656 | + process_reply_sync(ls, nodeid, reply); | |
11657 | + break; | |
11658 | + default: | |
11659 | + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u", | |
11660 | + reply->rc_subcmd, nodeid); | |
11661 | + } | |
11662 | +} | |
11663 | + | |
11664 | + | |
11665 | +static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header) | |
11666 | +{ | |
11667 | + struct writequeue_entry *wq; | |
11668 | + gd_rcom_t *rc = (gd_rcom_t *) header; | |
11669 | + gd_rcom_t *reply; | |
11670 | + | |
11671 | + wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL, | |
11672 | + (char **)&reply); | |
11673 | + if (!wq) | |
11674 | + return -ENOMEM; | |
11675 | + | |
11676 | + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; | |
11677 | + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; | |
11678 | + reply->rc_subcmd = rc->rc_subcmd; | |
11679 | + reply->rc_msgid = rc->rc_msgid; | |
11680 | + reply->rc_buf[0] = 0; | |
11681 | + | |
11682 | + reply->rc_datalen = 1; | |
11683 | + reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1; | |
11684 | + | |
11685 | + midcomms_send_buffer((struct gd_req_header *)reply, wq); | |
11686 | + return 0; | |
11687 | +} | |
11688 | + | |
11689 | + | |
11690 | +/* | |
11691 | + * Runs in same context as midcomms. Both recovery requests and recovery | |
11692 | + * replies come through this function. | |
11693 | + */ | |
11694 | + | |
11695 | +void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header) | |
11696 | +{ | |
11697 | + gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace); | |
11698 | + gd_rcom_t *rc = (gd_rcom_t *) header; | |
11699 | + | |
11700 | + /* If the lockspace doesn't exist then still send a status message | |
11701 | + back, it's possible that it just doesn't have it's global_id | |
11702 | + yet. */ | |
11703 | + if (!ls) { | |
11704 | + send_ls_not_ready(nodeid, header); | |
11705 | + return; | |
11706 | + } | |
11707 | + | |
11708 | + switch (header->rh_cmd) { | |
11709 | + case GDLM_REMCMD_RECOVERMESSAGE: | |
11710 | + down_read(&ls->ls_rec_rsblist); | |
11711 | + rcom_process_message(ls, nodeid, rc); | |
11712 | + up_read(&ls->ls_rec_rsblist); | |
11713 | + break; | |
11714 | + | |
11715 | + case GDLM_REMCMD_RECOVERREPLY: | |
11716 | + rcom_process_reply(ls, nodeid, rc); | |
11717 | + break; | |
11718 | + | |
11719 | + default: | |
11720 | + GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd);); | |
11721 | + } | |
11722 | +} | |
11723 | + | |
11724 | diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h | |
11725 | --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 11726 | +++ linux-patched/cluster/dlm/reccomms.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 11727 | @@ -0,0 +1,37 @@ |
11728 | +/****************************************************************************** | |
11729 | +******************************************************************************* | |
11730 | +** | |
11731 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
11732 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
11733 | +** | |
11734 | +** This copyrighted material is made available to anyone wishing to use, | |
11735 | +** modify, copy, or redistribute it subject to the terms and conditions | |
11736 | +** of the GNU General Public License v.2. | |
11737 | +** | |
11738 | +******************************************************************************* | |
11739 | +******************************************************************************/ | |
11740 | + | |
11741 | +#ifndef __RECCOMMS_DOT_H__ | |
11742 | +#define __RECCOMMS_DOT_H__ | |
11743 | + | |
11744 | +/* Bit flags */ | |
11745 | + | |
11746 | +#define RESDIR_VALID (1) | |
11747 | +#define RESDIR_ALL_VALID (2) | |
11748 | +#define NODES_VALID (4) | |
11749 | +#define NODES_ALL_VALID (8) | |
11750 | + | |
11751 | +#define RECCOMM_STATUS (1) | |
11752 | +#define RECCOMM_RECOVERNAMES (2) | |
11753 | +#define RECCOMM_GETMASTER (3) | |
11754 | +#define RECCOMM_BULKLOOKUP (4) | |
11755 | +#define RECCOMM_NEWLOCKS (5) | |
11756 | +#define RECCOMM_NEWLOCKIDS (6) | |
11757 | +#define RECCOMM_REMRESDATA (7) | |
11758 | + | |
11759 | +int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc, | |
11760 | + int need_reply); | |
11761 | +void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header); | |
11762 | +void rcom_log_clear(gd_ls_t *ls); | |
11763 | + | |
11764 | +#endif | |
11765 | diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c | |
11766 | --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 11767 | +++ linux-patched/cluster/dlm/recover.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 11768 | @@ -0,0 +1,632 @@ |
11769 | +/****************************************************************************** | |
11770 | +******************************************************************************* | |
11771 | +** | |
11772 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
11773 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
11774 | +** | |
11775 | +** This copyrighted material is made available to anyone wishing to use, | |
11776 | +** modify, copy, or redistribute it subject to the terms and conditions | |
11777 | +** of the GNU General Public License v.2. | |
11778 | +** | |
11779 | +******************************************************************************* | |
11780 | +******************************************************************************/ | |
11781 | + | |
11782 | +#include "dlm_internal.h" | |
11783 | +#include "reccomms.h" | |
11784 | +#include "dir.h" | |
11785 | +#include "locking.h" | |
11786 | +#include "rsb.h" | |
11787 | +#include "lockspace.h" | |
11788 | +#include "lkb.h" | |
11789 | +#include "nodes.h" | |
11790 | +#include "config.h" | |
11791 | +#include "ast.h" | |
11792 | +#include "memory.h" | |
11793 | + | |
11794 | +/* | |
11795 | + * Called in recovery routines to check whether the recovery process has been | |
11796 | + * interrupted/stopped by another transition. A recovery in-process will abort | |
11797 | + * if the lockspace is "stopped" so that a new recovery process can start from | |
11798 | + * the beginning when the lockspace is "started" again. | |
11799 | + */ | |
11800 | + | |
11801 | +int gdlm_recovery_stopped(gd_ls_t *ls) | |
11802 | +{ | |
11803 | + return test_bit(LSFL_LS_STOP, &ls->ls_flags); | |
11804 | +} | |
11805 | + | |
11806 | +static void gdlm_wait_timer_fn(unsigned long data) | |
11807 | +{ | |
11808 | + gd_ls_t *ls = (gd_ls_t *) data; | |
11809 | + | |
11810 | + wake_up(&ls->ls_wait_general); | |
11811 | +} | |
11812 | + | |
11813 | +/* | |
11814 | + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP | |
11815 | + * set due to failure of a node in ls_nodes). When another function thinks it | |
11816 | + * could have completed the waited-on task, they should wake up ls_wait_general | |
11817 | + * to get an immediate response rather than waiting for the timer to detect the | |
11818 | + * result. A timer wakes us up periodically while waiting to see if we should | |
11819 | + * abort due to a node failure. | |
11820 | + */ | |
11821 | + | |
11822 | +int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls)) | |
11823 | +{ | |
11824 | + struct timer_list timer; | |
11825 | + int error = 0; | |
11826 | + | |
11827 | + init_timer(&timer); | |
11828 | + timer.function = gdlm_wait_timer_fn; | |
11829 | + timer.data = (long) ls; | |
11830 | + | |
11831 | + for (;;) { | |
11832 | + mod_timer(&timer, jiffies + (5 * HZ)); | |
11833 | + | |
11834 | + wchan_cond_sleep_intr(ls->ls_wait_general, | |
11835 | + !testfn(ls) && | |
11836 | + !test_bit(LSFL_LS_STOP, &ls->ls_flags)); | |
11837 | + | |
11838 | + if (timer_pending(&timer)) | |
11839 | + del_timer(&timer); | |
11840 | + | |
11841 | + if (testfn(ls)) | |
11842 | + break; | |
11843 | + | |
11844 | + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) { | |
11845 | + error = -1; | |
11846 | + break; | |
11847 | + } | |
11848 | + } | |
11849 | + | |
11850 | + return error; | |
11851 | +} | |
11852 | + | |
11853 | +int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status) | |
11854 | +{ | |
11855 | + gd_rcom_t rc_stack, *rc; | |
11856 | + gd_csb_t *csb; | |
11857 | + int status; | |
11858 | + int error = 0; | |
11859 | + | |
11860 | + memset(&rc_stack, 0, sizeof(gd_rcom_t)); | |
11861 | + rc = &rc_stack; | |
11862 | + rc->rc_datalen = 0; | |
11863 | + | |
11864 | + list_for_each_entry(csb, &ls->ls_nodes, csb_list) { | |
11865 | + for (;;) { | |
11866 | + error = gdlm_recovery_stopped(ls); | |
11867 | + if (error) | |
11868 | + goto out; | |
11869 | + | |
11870 | + error = rcom_send_message(ls, csb->csb_node->gn_nodeid, | |
11871 | + RECCOMM_STATUS, rc, 1); | |
11872 | + if (error) | |
11873 | + goto out; | |
11874 | + | |
11875 | + status = rc->rc_buf[0]; | |
11876 | + if (status & wait_status) | |
11877 | + break; | |
11878 | + else { | |
11879 | + set_current_state(TASK_INTERRUPTIBLE); | |
11880 | + schedule_timeout(HZ >> 1); | |
11881 | + } | |
11882 | + } | |
11883 | + } | |
11884 | + | |
11885 | + out: | |
11886 | + return error; | |
11887 | +} | |
11888 | + | |
11889 | +int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status) | |
11890 | +{ | |
11891 | + gd_rcom_t rc_stack, *rc; | |
11892 | + uint32_t nodeid = ls->ls_low_nodeid; | |
11893 | + int status; | |
11894 | + int error = 0; | |
11895 | + | |
11896 | + memset(&rc_stack, 0, sizeof(gd_rcom_t)); | |
11897 | + rc = &rc_stack; | |
11898 | + rc->rc_datalen = 0; | |
11899 | + | |
11900 | + for (;;) { | |
11901 | + error = gdlm_recovery_stopped(ls); | |
11902 | + if (error) | |
11903 | + goto out; | |
11904 | + | |
11905 | + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1); | |
11906 | + if (error) | |
11907 | + break; | |
11908 | + | |
11909 | + status = rc->rc_buf[0]; | |
11910 | + if (status & wait_status) | |
11911 | + break; | |
11912 | + else { | |
11913 | + set_current_state(TASK_INTERRUPTIBLE); | |
11914 | + schedule_timeout(HZ >> 1); | |
11915 | + } | |
11916 | + } | |
11917 | + | |
11918 | + out: | |
11919 | + return error; | |
11920 | +} | |
11921 | + | |
11922 | +static int purge_queue(gd_ls_t *ls, struct list_head *queue) | |
11923 | +{ | |
11924 | + gd_lkb_t *lkb, *safe; | |
11925 | + gd_res_t *rsb; | |
11926 | + int count = 0; | |
11927 | + | |
11928 | + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { | |
11929 | + if (!lkb->lkb_nodeid) | |
11930 | + continue; | |
11931 | + | |
11932 | + GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,); | |
11933 | + | |
11934 | + if (in_nodes_gone(ls, lkb->lkb_nodeid)) { | |
11935 | + list_del(&lkb->lkb_statequeue); | |
11936 | + | |
11937 | + rsb = lkb->lkb_resource; | |
11938 | + lkb->lkb_status = 0; | |
11939 | + | |
11940 | + if (lkb->lkb_status == GDLM_LKSTS_CONVERT | |
11941 | + && &lkb->lkb_duetime) | |
11942 | + remove_from_deadlockqueue(lkb); | |
11943 | + | |
11944 | + release_lkb(ls, lkb); | |
11945 | + release_rsb(rsb); | |
11946 | + count++; | |
11947 | + } | |
11948 | + } | |
11949 | + | |
11950 | + return count; | |
11951 | +} | |
11952 | + | |
11953 | +/* | |
11954 | + * Go through local restbl and for each rsb we're master of, clear out any | |
11955 | + * lkb's held by departed nodes. | |
11956 | + */ | |
11957 | + | |
11958 | +int restbl_lkb_purge(gd_ls_t *ls) | |
11959 | +{ | |
11960 | + struct list_head *tmp2, *safe2; | |
11961 | + int count = 0; | |
11962 | + gd_res_t *rootrsb, *safe, *rsb; | |
11963 | + | |
11964 | + log_all(ls, "purge locks of departed nodes"); | |
11965 | + | |
11966 | + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) { | |
11967 | + | |
11968 | + rootrsb->res_resdir_seq = 1; | |
11969 | + | |
11970 | + if (rootrsb->res_nodeid) | |
11971 | + continue; | |
11972 | + | |
11973 | + hold_rsb(rootrsb); | |
11974 | + down_write(&rootrsb->res_lock); | |
11975 | + | |
11976 | + /* This traverses the subreslist in reverse order so we purge | |
11977 | + * the children before their parents. */ | |
11978 | + | |
11979 | + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev; | |
11980 | + tmp2 != &rootrsb->res_subreslist; | |
11981 | + tmp2 = safe2, safe2 = safe2->prev) { | |
11982 | + rsb = list_entry(tmp2, gd_res_t, res_subreslist); | |
11983 | + | |
11984 | + hold_rsb(rsb); | |
11985 | + purge_queue(ls, &rsb->res_grantqueue); | |
11986 | + purge_queue(ls, &rsb->res_convertqueue); | |
11987 | + purge_queue(ls, &rsb->res_waitqueue); | |
11988 | + release_rsb(rsb); | |
11989 | + } | |
11990 | + count += purge_queue(ls, &rootrsb->res_grantqueue); | |
11991 | + count += purge_queue(ls, &rootrsb->res_convertqueue); | |
11992 | + count += purge_queue(ls, &rootrsb->res_waitqueue); | |
11993 | + | |
11994 | + up_write(&rootrsb->res_lock); | |
11995 | + release_rsb(rootrsb); | |
11996 | + } | |
11997 | + | |
11998 | + log_all(ls, "purged %d locks", count); | |
11999 | + | |
12000 | + return 0; | |
12001 | +} | |
12002 | + | |
12003 | +/* | |
12004 | + * Grant any locks that have become grantable after a purge | |
12005 | + */ | |
12006 | + | |
12007 | +int restbl_grant_after_purge(gd_ls_t *ls) | |
12008 | +{ | |
12009 | + gd_res_t *root, *rsb, *safe; | |
12010 | + int error = 0; | |
12011 | + | |
12012 | + down_write(&ls->ls_gap_rsblist); | |
12013 | + | |
12014 | + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) { | |
12015 | + /* only the rsb master grants locks */ | |
12016 | + if (root->res_nodeid) | |
12017 | + continue; | |
12018 | + | |
12019 | + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { | |
12020 | + log_debug(ls, "restbl_grant_after_purge aborted"); | |
12021 | + error = -EINTR; | |
12022 | + up_write(&ls->ls_gap_rsblist); | |
12023 | + goto out; | |
12024 | + } | |
12025 | + | |
12026 | + down_write(&root->res_lock); | |
12027 | + grant_pending_locks(root); | |
12028 | + up_write(&root->res_lock); | |
12029 | + | |
12030 | + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){ | |
12031 | + down_write(&rsb->res_lock); | |
12032 | + grant_pending_locks(rsb); | |
12033 | + up_write(&rsb->res_lock); | |
12034 | + } | |
12035 | + } | |
12036 | + up_write(&ls->ls_gap_rsblist); | |
12037 | + wake_astd(); | |
12038 | + out: | |
12039 | + return error; | |
12040 | +} | |
12041 | + | |
12042 | +/* | |
12043 | + * Set the lock master for all LKBs in a lock queue | |
12044 | + */ | |
12045 | + | |
12046 | +static void set_lock_master(struct list_head *queue, int nodeid) | |
12047 | +{ | |
12048 | + gd_lkb_t *lkb; | |
12049 | + | |
12050 | + list_for_each_entry(lkb, queue, lkb_statequeue) { | |
12051 | + /* Don't muck around with pre-exising sublocks */ | |
12052 | + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY)) | |
12053 | + lkb->lkb_nodeid = nodeid; | |
12054 | + } | |
12055 | +} | |
12056 | + | |
12057 | +static void set_master_lkbs(gd_res_t *rsb) | |
12058 | +{ | |
12059 | + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid); | |
12060 | + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid); | |
12061 | + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid); | |
12062 | +} | |
12063 | + | |
12064 | +/* | |
12065 | + * This rsb struct is now the master so it is responsible for keeping the | |
12066 | + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to | |
12067 | + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for | |
12068 | + * this rsb in deserialise_lkb. | |
12069 | + */ | |
12070 | + | |
12071 | +static void set_rsb_lvb(gd_res_t *rsb) | |
12072 | +{ | |
12073 | + gd_lkb_t *lkb; | |
12074 | + | |
12075 | + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) { | |
12076 | + | |
12077 | + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) && | |
12078 | + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) && | |
12079 | + (lkb->lkb_grmode > DLM_LOCK_NL)) | |
12080 | + { | |
12081 | + if (!rsb->res_lvbptr) | |
12082 | + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); | |
12083 | + | |
12084 | + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); | |
12085 | + return; | |
12086 | + } | |
12087 | + } | |
12088 | + | |
12089 | + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) { | |
12090 | + | |
12091 | + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) && | |
12092 | + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) && | |
12093 | + (lkb->lkb_grmode > DLM_LOCK_NL)) | |
12094 | + { | |
12095 | + if (!rsb->res_lvbptr) | |
12096 | + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); | |
12097 | + | |
12098 | + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); | |
12099 | + return; | |
12100 | + } | |
12101 | + } | |
12102 | +} | |
12103 | + | |
12104 | +/* | |
12105 | + * Propogate the new master nodeid to locks, subrsbs, sublocks. | |
12106 | + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider. | |
12107 | + */ | |
12108 | + | |
12109 | +static void set_new_master(gd_res_t *rsb) | |
12110 | +{ | |
12111 | + gd_res_t *subrsb; | |
12112 | + | |
12113 | + down_write(&rsb->res_lock); | |
12114 | + | |
12115 | + if (rsb->res_nodeid == our_nodeid()) { | |
12116 | + rsb->res_nodeid = 0; | |
12117 | + set_rsb_lvb(rsb); | |
12118 | + } | |
12119 | + | |
12120 | + set_master_lkbs(rsb); | |
12121 | + | |
12122 | + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) { | |
12123 | + subrsb->res_nodeid = rsb->res_nodeid; | |
12124 | + set_master_lkbs(subrsb); | |
12125 | + } | |
12126 | + | |
12127 | + up_write(&rsb->res_lock); | |
12128 | + | |
12129 | + set_bit(RESFL_NEW_MASTER, &rsb->res_flags); | |
12130 | +} | |
12131 | + | |
12132 | +/* | |
12133 | + * The recover_list contains all the rsb's for which we've requested the new | |
12134 | + * master nodeid. As replies are returned from the resource directories the | |
12135 | + * rsb's are removed from the list. When the list is empty we're done. | |
12136 | + * | |
12137 | + * The recover_list is later similarly used for all rsb's for which we've sent | |
12138 | + * new lkb's and need to receive new corresponding lkid's. | |
12139 | + */ | |
12140 | + | |
12141 | +int recover_list_empty(gd_ls_t *ls) | |
12142 | +{ | |
12143 | + int empty; | |
12144 | + | |
12145 | + spin_lock(&ls->ls_recover_list_lock); | |
12146 | + empty = list_empty(&ls->ls_recover_list); | |
12147 | + spin_unlock(&ls->ls_recover_list_lock); | |
12148 | + | |
12149 | + return empty; | |
12150 | +} | |
12151 | + | |
12152 | +int recover_list_count(gd_ls_t *ls) | |
12153 | +{ | |
12154 | + int count; | |
12155 | + | |
12156 | + spin_lock(&ls->ls_recover_list_lock); | |
12157 | + count = ls->ls_recover_list_count; | |
12158 | + spin_unlock(&ls->ls_recover_list_lock); | |
12159 | + | |
12160 | + return count; | |
12161 | +} | |
12162 | + | |
12163 | +void recover_list_add(gd_res_t *rsb) | |
12164 | +{ | |
12165 | + gd_ls_t *ls = rsb->res_ls; | |
12166 | + | |
12167 | + spin_lock(&ls->ls_recover_list_lock); | |
12168 | + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) { | |
12169 | + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list); | |
12170 | + ls->ls_recover_list_count++; | |
12171 | + hold_rsb(rsb); | |
12172 | + } | |
12173 | + spin_unlock(&ls->ls_recover_list_lock); | |
12174 | +} | |
12175 | + | |
12176 | +void recover_list_del(gd_res_t *rsb) | |
12177 | +{ | |
12178 | + gd_ls_t *ls = rsb->res_ls; | |
12179 | + | |
12180 | + spin_lock(&ls->ls_recover_list_lock); | |
12181 | + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags); | |
12182 | + list_del(&rsb->res_recover_list); | |
12183 | + ls->ls_recover_list_count--; | |
12184 | + spin_unlock(&ls->ls_recover_list_lock); | |
12185 | + | |
12186 | + release_rsb(rsb); | |
12187 | +} | |
12188 | + | |
12189 | +static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid) | |
12190 | +{ | |
12191 | + gd_res_t *rsb = NULL; | |
12192 | + | |
12193 | + spin_lock(&ls->ls_recover_list_lock); | |
12194 | + | |
12195 | + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) { | |
12196 | + if (rsb->res_recover_msgid == msgid) | |
12197 | + goto rec_found; | |
12198 | + } | |
12199 | + rsb = NULL; | |
12200 | + | |
12201 | + rec_found: | |
12202 | + spin_unlock(&ls->ls_recover_list_lock); | |
12203 | + return rsb; | |
12204 | +} | |
12205 | + | |
12206 | +#if 0 | |
12207 | +static void recover_list_clear(gd_ls_t *ls) | |
12208 | +{ | |
12209 | + gd_res_t *rsb; | |
12210 | + | |
12211 | + | |
12212 | + spin_lock(&ls->ls_recover_list_lock); | |
12213 | + | |
12214 | + while (!list_empty(&ls->ls_recover_list)) { | |
12215 | + rsb = list_entry(ls->ls_recover_list.next, gd_res_t, | |
12216 | + res_recover_list); | |
12217 | + list_del(&rsb->res_recover_list); | |
12218 | + ls->ls_recover_list_count--; | |
12219 | + } | |
12220 | + spin_unlock(&ls->ls_recover_list_lock); | |
12221 | + | |
12222 | +} | |
12223 | +#endif | |
12224 | + | |
12225 | +#if 0 | |
12226 | +void recover_list_dump(gd_ls_t *ls) | |
12227 | +{ | |
12228 | + struct list_head *tmp; | |
12229 | + gd_res_t *rsb; | |
12230 | + | |
12231 | + spin_lock(&ls->ls_recover_list_lock); | |
12232 | + | |
12233 | + printk("recover_list_count=%d\n", ls->ls_recover_list_count); | |
12234 | + | |
12235 | + list_for_each(tmp, &ls->ls_recover_list) { | |
12236 | + rsb = list_entry(tmp, gd_res_t, res_recover_list); | |
12237 | + gdlm_res_dbprint(rsb); | |
12238 | + } | |
12239 | + spin_unlock(&ls->ls_recover_list_lock); | |
12240 | +} | |
12241 | +#endif | |
12242 | + | |
12243 | +static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc) | |
12244 | +{ | |
12245 | + gd_ls_t *ls = rsb->res_ls; | |
12246 | + gd_resdata_t *rd; | |
12247 | + uint32_t dir_nodeid; | |
12248 | + int error; | |
12249 | + | |
12250 | + dir_nodeid = get_directory_nodeid(rsb); | |
12251 | + | |
12252 | + if (dir_nodeid == our_nodeid()) { | |
12253 | + error = get_resdata(ls, dir_nodeid, rsb->res_name, | |
12254 | + rsb->res_length, &rd, 1); | |
12255 | + if (error) | |
12256 | + goto fail; | |
12257 | + | |
12258 | + rsb->res_nodeid = rd->rd_master_nodeid; | |
12259 | + set_new_master(rsb); | |
12260 | + } else { | |
12261 | + /* As we are the only thread doing recovery this | |
12262 | + should be safe. if not then we need to use a different | |
12263 | + ID somehow. We must set it in the RSB before rcom_send_msg | |
12264 | + completes cos we may get a reply quite quickly. | |
12265 | + */ | |
12266 | + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1; | |
12267 | + | |
12268 | + recover_list_add(rsb); | |
12269 | + | |
12270 | + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length); | |
12271 | + rc->rc_datalen = rsb->res_length; | |
12272 | + | |
12273 | + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER, | |
12274 | + rc, 0); | |
12275 | + if (error) | |
12276 | + goto fail; | |
12277 | + } | |
12278 | + | |
12279 | + fail: | |
12280 | + return error; | |
12281 | +} | |
12282 | + | |
12283 | +/* | |
12284 | + * Go through local root resources and for each rsb which has a master which | |
12285 | + * has departed, get the new master nodeid from the resdir. The resdir will | |
12286 | + * assign mastery to the first node to look up the new master. That means | |
12287 | + * we'll discover in this lookup if we're the new master of any rsb's. | |
12288 | + * | |
12289 | + * We fire off all the resdir requests individually and asynchronously to the | |
12290 | + * correct resdir node. The replies are processed in rsb_master_recv(). | |
12291 | + */ | |
12292 | + | |
12293 | +int restbl_rsb_update(gd_ls_t *ls) | |
12294 | +{ | |
12295 | + gd_res_t *rsb, *safe; | |
12296 | + gd_rcom_t *rc; | |
12297 | + int error = -ENOMEM; | |
12298 | + int count = 0; | |
12299 | + | |
12300 | + log_all(ls, "update remastered resources"); | |
12301 | + | |
12302 | + rc = allocate_rcom_buffer(ls); | |
12303 | + if (!rc) | |
12304 | + goto out; | |
12305 | + | |
12306 | + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) { | |
12307 | + if (!rsb->res_nodeid) | |
12308 | + continue; | |
12309 | + | |
12310 | + error = gdlm_recovery_stopped(ls); | |
12311 | + if (error) | |
12312 | + goto out_free; | |
12313 | + | |
12314 | + if (in_nodes_gone(ls, rsb->res_nodeid)) { | |
12315 | + error = rsb_master_lookup(rsb, rc); | |
12316 | + if (error) | |
12317 | + goto out_free; | |
12318 | + count++; | |
12319 | + } | |
12320 | + } | |
12321 | + | |
12322 | + error = gdlm_wait_function(ls, &recover_list_empty); | |
12323 | + | |
12324 | + log_all(ls, "updated %d resources", count); | |
12325 | + | |
12326 | + out_free: | |
12327 | + free_rcom_buffer(rc); | |
12328 | + | |
12329 | + out: | |
12330 | + return error; | |
12331 | +} | |
12332 | + | |
12333 | +int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length, | |
12334 | + int msgid) | |
12335 | +{ | |
12336 | + gd_res_t *rsb; | |
12337 | + uint32_t be_nodeid; | |
12338 | + | |
12339 | + rsb = recover_list_find(ls, msgid); | |
12340 | + if (!rsb) { | |
12341 | + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid); | |
12342 | + goto out; | |
12343 | + } | |
12344 | + | |
12345 | + memcpy(&be_nodeid, buf, sizeof(uint32_t)); | |
12346 | + rsb->res_nodeid = be32_to_cpu(be_nodeid); | |
12347 | + set_new_master(rsb); | |
12348 | + recover_list_del(rsb); | |
12349 | + | |
12350 | + if (recover_list_empty(ls)) | |
12351 | + wake_up(&ls->ls_wait_general); | |
12352 | + | |
12353 | + out: | |
12354 | + return 0; | |
12355 | +} | |
12356 | + | |
12357 | +/* | |
12358 | + * This function not used any longer. | |
12359 | + */ | |
12360 | + | |
12361 | +int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen, | |
12362 | + char *outbuf) | |
12363 | +{ | |
12364 | + char *inbufptr, *outbufptr; | |
12365 | + | |
12366 | + /* | |
12367 | + * The other node wants nodeids matching the resource names in inbuf. | |
12368 | + * The resource names are packed into inbuf as | |
12369 | + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is | |
12370 | + * lenX bytes. Matching nodeids are packed into outbuf in order | |
12371 | + * [nodeid1][nodeid2]... | |
12372 | + */ | |
12373 | + | |
12374 | + inbufptr = inbuf; | |
12375 | + outbufptr = outbuf; | |
12376 | + | |
12377 | + while (inbufptr < inbuf + inlen) { | |
12378 | + gd_resdata_t *rd; | |
12379 | + uint32_t be_nodeid; | |
12380 | + int status; | |
12381 | + | |
12382 | + status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr, | |
12383 | + &rd, 1); | |
12384 | + if (status != 0) | |
12385 | + goto fail; | |
12386 | + | |
12387 | + inbufptr += *inbufptr + 1; | |
12388 | + | |
12389 | + be_nodeid = cpu_to_be32(rd->rd_master_nodeid); | |
12390 | + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t)); | |
12391 | + outbufptr += sizeof(uint32_t); | |
12392 | + | |
12393 | + /* add assertion that outbufptr - outbuf is not > than ... */ | |
12394 | + } | |
12395 | + | |
12396 | + return (outbufptr - outbuf); | |
12397 | + | |
12398 | + fail: | |
12399 | + return -1; | |
12400 | +} | |
12401 | diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h | |
12402 | --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 12403 | +++ linux-patched/cluster/dlm/recover.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 12404 | @@ -0,0 +1,34 @@ |
12405 | +/****************************************************************************** | |
12406 | +******************************************************************************* | |
12407 | +** | |
12408 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
12409 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
12410 | +** | |
12411 | +** This copyrighted material is made available to anyone wishing to use, | |
12412 | +** modify, copy, or redistribute it subject to the terms and conditions | |
12413 | +** of the GNU General Public License v.2. | |
12414 | +** | |
12415 | +******************************************************************************* | |
12416 | +******************************************************************************/ | |
12417 | + | |
12418 | +#ifndef __RECOVER_DOT_H__ | |
12419 | +#define __RECOVER_DOT_H__ | |
12420 | + | |
12421 | +int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls)); | |
12422 | +int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status); | |
12423 | +int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status); | |
12424 | +int gdlm_recovery_stopped(gd_ls_t * ls); | |
12425 | +int recover_list_empty(gd_ls_t * ls); | |
12426 | +int recover_list_count(gd_ls_t * ls); | |
12427 | +void recover_list_add(gd_res_t * rsb); | |
12428 | +void recover_list_del(gd_res_t * rsb); | |
12429 | +void recover_list_dump(gd_ls_t * ls); | |
12430 | +int restbl_lkb_purge(gd_ls_t * ls); | |
12431 | +void restbl_grant_after_purge(gd_ls_t * ls); | |
12432 | +int restbl_rsb_update(gd_ls_t * ls); | |
12433 | +int restbl_rsb_update_recv(gd_ls_t * ls, int nodeid, char *buf, int len, | |
12434 | + int msgid); | |
12435 | +int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen, | |
12436 | + char *outbuf); | |
12437 | + | |
12438 | +#endif /* __RECOVER_DOT_H__ */ | |
12439 | diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c | |
12440 | --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 12441 | +++ linux-patched/cluster/dlm/recoverd.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 12442 | @@ -0,0 +1,692 @@ |
12443 | +/****************************************************************************** | |
12444 | +******************************************************************************* | |
12445 | +** | |
12446 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
12447 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
12448 | +** | |
12449 | +** This copyrighted material is made available to anyone wishing to use, | |
12450 | +** modify, copy, or redistribute it subject to the terms and conditions | |
12451 | +** of the GNU General Public License v.2. | |
12452 | +** | |
12453 | +******************************************************************************* | |
12454 | +******************************************************************************/ | |
12455 | + | |
12456 | +#include "dlm_internal.h" | |
12457 | +#include "nodes.h" | |
12458 | +#include "dir.h" | |
12459 | +#include "ast.h" | |
12460 | +#include "recover.h" | |
12461 | +#include "lockspace.h" | |
12462 | +#include "lowcomms.h" | |
12463 | +#include "lockqueue.h" | |
12464 | +#include "lkb.h" | |
12465 | +#include "rebuild.h" | |
12466 | + | |
12467 | +/* | |
12468 | + * next_move actions | |
12469 | + */ | |
12470 | + | |
12471 | +#define DO_STOP (1) | |
12472 | +#define DO_START (2) | |
12473 | +#define DO_FINISH (3) | |
12474 | +#define DO_FINISH_STOP (4) | |
12475 | +#define DO_FINISH_START (5) | |
12476 | + | |
12477 | +/* | |
12478 | + * recoverd_flags for thread | |
12479 | + */ | |
12480 | + | |
12481 | +#define THREAD_STOP (0) | |
12482 | + | |
12483 | +/* | |
12484 | + * local thread variables | |
12485 | + */ | |
12486 | + | |
12487 | +static unsigned long recoverd_flags; | |
12488 | +static struct completion recoverd_run; | |
12489 | +static wait_queue_head_t recoverd_wait; | |
12490 | +static struct task_struct *recoverd_task; | |
12491 | + | |
12492 | +/* | |
12493 | + * Queue of lockspaces (gr_recover_t structs) which need to be | |
12494 | + * started/recovered | |
12495 | + */ | |
12496 | + | |
12497 | +static struct list_head recoverd_start_queue; | |
12498 | +static atomic_t recoverd_start_count; | |
12499 | + | |
12500 | +extern struct list_head lslist; | |
12501 | +extern spinlock_t lslist_lock; | |
12502 | + | |
12503 | +void dlm_recoverd_init(void) | |
12504 | +{ | |
12505 | + INIT_LIST_HEAD(&recoverd_start_queue); | |
12506 | + atomic_set(&recoverd_start_count, 0); | |
12507 | + | |
12508 | + init_completion(&recoverd_run); | |
12509 | + init_waitqueue_head(&recoverd_wait); | |
12510 | + memset(&recoverd_flags, 0, sizeof(unsigned long)); | |
12511 | +} | |
12512 | + | |
12513 | +static int enable_locking(gd_ls_t *ls, int event_id) | |
12514 | +{ | |
12515 | + int error = 0; | |
12516 | + | |
12517 | + spin_lock(&ls->ls_recover_lock); | |
12518 | + if (ls->ls_last_stop < event_id) { | |
12519 | + set_bit(LSFL_LS_RUN, &ls->ls_flags); | |
12520 | + up_write(&ls->ls_in_recovery); | |
12521 | + } else { | |
12522 | + error = -EINTR; | |
12523 | + log_debug(ls, "enable_locking: abort %d", event_id); | |
12524 | + } | |
12525 | + spin_unlock(&ls->ls_recover_lock); | |
12526 | + return error; | |
12527 | +} | |
12528 | + | |
12529 | +static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr) | |
12530 | +{ | |
12531 | + int error; | |
12532 | + | |
12533 | + log_all(ls, "recover event %u (first)", gr->gr_event_id); | |
12534 | + | |
12535 | + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id); | |
12536 | + | |
12537 | + error = ls_nodes_init(ls, gr); | |
12538 | + if (error) { | |
12539 | + log_error(ls, "nodes_init failed %d", error); | |
12540 | + goto out; | |
12541 | + } | |
12542 | + | |
12543 | + error = resdir_rebuild_local(ls); | |
12544 | + if (error) { | |
12545 | + log_error(ls, "resdir_rebuild_local failed %d", error); | |
12546 | + goto out; | |
12547 | + } | |
12548 | + | |
12549 | + error = resdir_rebuild_wait(ls); | |
12550 | + if (error) { | |
12551 | + log_error(ls, "resdir_rebuild_wait failed %d", error); | |
12552 | + goto out; | |
12553 | + } | |
12554 | + | |
12555 | + log_all(ls, "recover event %u done", gr->gr_event_id); | |
12556 | + kcl_start_done(ls->ls_local_id, gr->gr_event_id); | |
12557 | + | |
12558 | + out: | |
12559 | + return error; | |
12560 | +} | |
12561 | + | |
12562 | +/* | |
12563 | + * We are given here a new group of nodes which are in the lockspace. We first | |
12564 | + * figure out the differences in ls membership from when we were last running. | |
12565 | + * If nodes from before are gone, then there will be some lock recovery to do. | |
12566 | + * If there are only nodes which have joined, then there's no lock recovery. | |
12567 | + * | |
12568 | + * note: cman requires an rc to finish starting on an revent (where nodes die) | |
12569 | + * before it allows an sevent (where nodes join) to be processed. This means | |
12570 | + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA | |
12571 | + * joined. | |
12572 | + */ | |
12573 | + | |
12574 | +static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr) | |
12575 | +{ | |
12576 | + int error, neg = 0; | |
12577 | + | |
12578 | + log_all(ls, "recover event %u", gr->gr_event_id); | |
12579 | + | |
12580 | + /* | |
12581 | + * Add or remove nodes from the lockspace's ls_nodes list. | |
12582 | + */ | |
12583 | + | |
12584 | + error = ls_nodes_reconfig(ls, gr, &neg); | |
12585 | + if (error) { | |
12586 | + log_error(ls, "nodes_reconfig failed %d", error); | |
12587 | + goto fail; | |
12588 | + } | |
12589 | + | |
12590 | + /* | |
12591 | + * Rebuild our own share of the resdir by collecting from all other | |
12592 | + * nodes rsb name/master pairs for which the name hashes to us. | |
12593 | + */ | |
12594 | + | |
12595 | + error = resdir_rebuild_local(ls); | |
12596 | + if (error) { | |
12597 | + log_error(ls, "resdir_rebuild_local failed %d", error); | |
12598 | + goto fail; | |
12599 | + } | |
12600 | + | |
12601 | + /* | |
12602 | + * Purge resdir-related requests that are being held in requestqueue. | |
12603 | + * All resdir requests from before recovery started are invalid now due | |
12604 | + * to the resdir rebuild and will be resent by the requesting nodes. | |
12605 | + */ | |
12606 | + | |
12607 | + purge_requestqueue(ls); | |
12608 | + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags); | |
12609 | + | |
12610 | + /* | |
12611 | + * Wait for all nodes to complete resdir rebuild. | |
12612 | + */ | |
12613 | + | |
12614 | + error = resdir_rebuild_wait(ls); | |
12615 | + if (error) { | |
12616 | + log_error(ls, "resdir_rebuild_wait failed %d", error); | |
12617 | + goto fail; | |
12618 | + } | |
12619 | + | |
12620 | + /* | |
12621 | + * Mark our own lkb's waiting in the lockqueue for remote replies from | |
12622 | + * nodes that are now departed. These will be resent to the new | |
12623 | + * masters in resend_cluster_requests. Also mark resdir lookup | |
12624 | + * requests for resending. | |
12625 | + */ | |
12626 | + | |
12627 | + lockqueue_lkb_mark(ls); | |
12628 | + | |
12629 | + error = gdlm_recovery_stopped(ls); | |
12630 | + if (error) | |
12631 | + goto fail; | |
12632 | + | |
12633 | + if (neg) { | |
12634 | + /* | |
12635 | + * Clear lkb's for departed nodes. This can't fail since it | |
12636 | + * doesn't involve communicating with other nodes. | |
12637 | + */ | |
12638 | + | |
12639 | + down_write(&ls->ls_rec_rsblist); | |
12640 | + restbl_lkb_purge(ls); | |
12641 | + up_write(&ls->ls_rec_rsblist); | |
12642 | + | |
12643 | + down_read(&ls->ls_rec_rsblist); | |
12644 | + | |
12645 | + /* | |
12646 | + * Get new master id's for rsb's of departed nodes. This fails | |
12647 | + * if we can't communicate with other nodes. | |
12648 | + */ | |
12649 | + | |
12650 | + error = restbl_rsb_update(ls); | |
12651 | + if (error) { | |
12652 | + log_error(ls, "restbl_rsb_update failed %d", error); | |
12653 | + goto fail_up; | |
12654 | + } | |
12655 | + | |
12656 | + /* | |
12657 | + * Send our lkb info to new masters. This fails if we can't | |
12658 | + * communicate with a node. | |
12659 | + */ | |
12660 | + | |
12661 | + error = rebuild_rsbs_send(ls); | |
12662 | + if (error) { | |
12663 | + log_error(ls, "rebuild_rsbs_send failed %d", error); | |
12664 | + goto fail_up; | |
12665 | + } | |
12666 | + up_read(&ls->ls_rec_rsblist); | |
12667 | + } | |
12668 | + | |
12669 | + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags); | |
12670 | + | |
12671 | + log_all(ls, "recover event %u done", gr->gr_event_id); | |
12672 | + kcl_start_done(ls->ls_local_id, gr->gr_event_id); | |
12673 | + return 0; | |
12674 | + | |
12675 | + fail_up: | |
12676 | + up_read(&ls->ls_rec_rsblist); | |
12677 | + fail: | |
12678 | + log_all(ls, "recover event %d error %d", gr->gr_event_id, error); | |
12679 | + return error; | |
12680 | +} | |
12681 | + | |
12682 | +static void clear_finished_nodes(gd_ls_t *ls, int finish_event) | |
12683 | +{ | |
12684 | + gd_csb_t *csb, *safe; | |
12685 | + | |
12686 | + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) { | |
12687 | + if (csb->csb_gone_event <= finish_event) { | |
12688 | + list_del(&csb->csb_list); | |
12689 | + release_csb(csb); | |
12690 | + } | |
12691 | + } | |
12692 | +} | |
12693 | + | |
12694 | +/* | |
12695 | + * Between calls to this routine for a ls, there can be multiple stop/start | |
12696 | + * events from cman where every start but the latest is cancelled by stops. | |
12697 | + * There can only be a single finish from cman because every finish requires us | |
12698 | + * to call start_done. A single finish event could be followed by multiple | |
12699 | + * stop/start events. This routine takes any combination of events from cman | |
12700 | + * and boils them down to one course of action. | |
12701 | + */ | |
12702 | + | |
12703 | +int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out) | |
12704 | +{ | |
12705 | + LIST_HEAD(events); | |
12706 | + unsigned int cmd = 0, stop, start, finish; | |
12707 | + unsigned int last_stop, last_start, last_finish; | |
12708 | + gd_recover_t *gr = NULL, *start_gr = NULL; | |
12709 | + | |
12710 | + /* | |
12711 | + * Grab the current state of cman/sm events. | |
12712 | + */ | |
12713 | + | |
12714 | + spin_lock(&ls->ls_recover_lock); | |
12715 | + | |
12716 | + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0; | |
12717 | + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0; | |
12718 | + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0; | |
12719 | + | |
12720 | + last_stop = ls->ls_last_stop; | |
12721 | + last_start = ls->ls_last_start; | |
12722 | + last_finish = ls->ls_last_finish; | |
12723 | + | |
12724 | + while (!list_empty(&ls->ls_recover)) { | |
12725 | + gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list); | |
12726 | + list_del(&gr->gr_list); | |
12727 | + list_add_tail(&gr->gr_list, &events); | |
12728 | + } | |
12729 | + spin_unlock(&ls->ls_recover_lock); | |
12730 | + | |
12731 | + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish, | |
12732 | + last_stop, last_start, last_finish); | |
12733 | + | |
12734 | + /* | |
12735 | + * Toss start events which have since been cancelled. | |
12736 | + */ | |
12737 | + | |
12738 | + while (!list_empty(&events)) { | |
12739 | + GDLM_ASSERT(start,); | |
12740 | + gr = list_entry(events.next, gd_recover_t, gr_list); | |
12741 | + list_del(&gr->gr_list); | |
12742 | + | |
12743 | + if (gr->gr_event_id <= last_stop) { | |
12744 | + log_debug(ls, "move skip event %u", gr->gr_event_id); | |
12745 | + kfree(gr->gr_nodeids); | |
12746 | + free_dlm_recover(gr); | |
12747 | + gr = NULL; | |
12748 | + } else { | |
12749 | + log_debug(ls, "move use event %u", gr->gr_event_id); | |
12750 | + GDLM_ASSERT(!start_gr,); | |
12751 | + start_gr = gr; | |
12752 | + } | |
12753 | + } | |
12754 | + | |
12755 | + /* | |
12756 | + * Eight possible combinations of events. | |
12757 | + */ | |
12758 | + | |
12759 | + /* 0 */ | |
12760 | + if (!stop && !start && !finish) { | |
12761 | + GDLM_ASSERT(!start_gr,); | |
12762 | + cmd = 0; | |
12763 | + goto out; | |
12764 | + } | |
12765 | + | |
12766 | + /* 1 */ | |
12767 | + if (!stop && !start && finish) { | |
12768 | + GDLM_ASSERT(!start_gr,); | |
12769 | + GDLM_ASSERT(last_start > last_stop,); | |
12770 | + GDLM_ASSERT(last_finish == last_start,); | |
12771 | + cmd = DO_FINISH; | |
12772 | + *finish_out = last_finish; | |
12773 | + goto out; | |
12774 | + } | |
12775 | + | |
12776 | + /* 2 */ | |
12777 | + if (!stop && start && !finish) { | |
12778 | + GDLM_ASSERT(start_gr,); | |
12779 | + GDLM_ASSERT(last_start > last_stop,); | |
12780 | + cmd = DO_START; | |
12781 | + *gr_out = start_gr; | |
12782 | + goto out; | |
12783 | + } | |
12784 | + | |
12785 | + /* 3 */ | |
12786 | + if (!stop && start && finish) { | |
12787 | + GDLM_ASSERT(0, printk("finish and start with no stop\n");); | |
12788 | + } | |
12789 | + | |
12790 | + /* 4 */ | |
12791 | + if (stop && !start && !finish) { | |
12792 | + GDLM_ASSERT(!start_gr,); | |
12793 | + GDLM_ASSERT(last_start == last_stop,); | |
12794 | + cmd = DO_STOP; | |
12795 | + goto out; | |
12796 | + } | |
12797 | + | |
12798 | + /* 5 */ | |
12799 | + if (stop && !start && finish) { | |
12800 | + GDLM_ASSERT(!start_gr,); | |
12801 | + GDLM_ASSERT(last_finish == last_start,); | |
12802 | + GDLM_ASSERT(last_stop == last_start,); | |
12803 | + cmd = DO_FINISH_STOP; | |
12804 | + *finish_out = last_finish; | |
12805 | + goto out; | |
12806 | + } | |
12807 | + | |
12808 | + /* 6 */ | |
12809 | + if (stop && start && !finish) { | |
12810 | + if (start_gr) { | |
12811 | + GDLM_ASSERT(last_start > last_stop,); | |
12812 | + cmd = DO_START; | |
12813 | + *gr_out = start_gr; | |
12814 | + } else { | |
12815 | + GDLM_ASSERT(last_stop == last_start,); | |
12816 | + cmd = DO_STOP; | |
12817 | + } | |
12818 | + goto out; | |
12819 | + } | |
12820 | + | |
12821 | + /* 7 */ | |
12822 | + if (stop && start && finish) { | |
12823 | + if (start_gr) { | |
12824 | + GDLM_ASSERT(last_start > last_stop,); | |
12825 | + GDLM_ASSERT(last_start > last_finish,); | |
12826 | + cmd = DO_FINISH_START; | |
12827 | + *finish_out = last_finish; | |
12828 | + *gr_out = start_gr; | |
12829 | + } else { | |
12830 | + GDLM_ASSERT(last_start == last_stop,); | |
12831 | + GDLM_ASSERT(last_start > last_finish,); | |
12832 | + cmd = DO_FINISH_STOP; | |
12833 | + *finish_out = last_finish; | |
12834 | + } | |
12835 | + goto out; | |
12836 | + } | |
12837 | + | |
12838 | + out: | |
12839 | + return cmd; | |
12840 | +} | |
12841 | + | |
12842 | +/* | |
12843 | + * This function decides what to do given every combination of current | |
12844 | + * lockspace state and next lockspace state. | |
12845 | + */ | |
12846 | + | |
12847 | +static void do_ls_recovery(gd_ls_t *ls) | |
12848 | +{ | |
12849 | + gd_recover_t *gr = NULL; | |
12850 | + int error, cur_state, next_state = 0, do_now, finish_event = 0; | |
12851 | + | |
12852 | + do_now = next_move(ls, &gr, &finish_event); | |
12853 | + if (!do_now) | |
12854 | + goto out; | |
12855 | + | |
12856 | + cur_state = ls->ls_state; | |
12857 | + next_state = 0; | |
12858 | + | |
12859 | + GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags), | |
12860 | + log_error(ls, "curstate=%d donow=%d", cur_state, do_now);); | |
12861 | + | |
12862 | + /* | |
12863 | + * LSST_CLEAR - we're not in any recovery state. We can get a stop or | |
12864 | + * a stop and start which equates with a START. | |
12865 | + */ | |
12866 | + | |
12867 | + if (cur_state == LSST_CLEAR) { | |
12868 | + switch (do_now) { | |
12869 | + case DO_STOP: | |
12870 | + next_state = LSST_WAIT_START; | |
12871 | + break; | |
12872 | + | |
12873 | + case DO_START: | |
12874 | + error = ls_reconfig(ls, gr); | |
12875 | + if (error) | |
12876 | + next_state = LSST_WAIT_START; | |
12877 | + else | |
12878 | + next_state = LSST_RECONFIG_DONE; | |
12879 | + break; | |
12880 | + | |
12881 | + case DO_FINISH: /* invalid */ | |
12882 | + case DO_FINISH_STOP: /* invalid */ | |
12883 | + case DO_FINISH_START: /* invalid */ | |
12884 | + default: | |
12885 | + GDLM_ASSERT(0,); | |
12886 | + } | |
12887 | + goto out; | |
12888 | + } | |
12889 | + | |
12890 | + /* | |
12891 | + * LSST_WAIT_START - we're not running because of getting a stop or | |
12892 | + * failing a start. We wait in this state for another stop/start or | |
12893 | + * just the next start to begin another reconfig attempt. | |
12894 | + */ | |
12895 | + | |
12896 | + if (cur_state == LSST_WAIT_START) { | |
12897 | + switch (do_now) { | |
12898 | + case DO_STOP: | |
12899 | + break; | |
12900 | + | |
12901 | + case DO_START: | |
12902 | + error = ls_reconfig(ls, gr); | |
12903 | + if (error) | |
12904 | + next_state = LSST_WAIT_START; | |
12905 | + else | |
12906 | + next_state = LSST_RECONFIG_DONE; | |
12907 | + break; | |
12908 | + | |
12909 | + case DO_FINISH: /* invalid */ | |
12910 | + case DO_FINISH_STOP: /* invalid */ | |
12911 | + case DO_FINISH_START: /* invalid */ | |
12912 | + default: | |
12913 | + GDLM_ASSERT(0,); | |
12914 | + } | |
12915 | + goto out; | |
12916 | + } | |
12917 | + | |
12918 | + /* | |
12919 | + * LSST_RECONFIG_DONE - we entered this state after successfully | |
12920 | + * completing ls_reconfig and calling kcl_start_done. We expect to get | |
12921 | + * a finish if everything goes ok. A finish could be followed by stop | |
12922 | + * or stop/start before we get here to check it. Or a finish may never | |
12923 | + * happen, only stop or stop/start. | |
12924 | + */ | |
12925 | + | |
12926 | + if (cur_state == LSST_RECONFIG_DONE) { | |
12927 | + switch (do_now) { | |
12928 | + case DO_FINISH: | |
12929 | + clear_finished_nodes(ls, finish_event); | |
12930 | + next_state = LSST_CLEAR; | |
12931 | + | |
12932 | + error = enable_locking(ls, finish_event); | |
12933 | + if (error) | |
12934 | + break; | |
12935 | + | |
12936 | + error = process_requestqueue(ls); | |
12937 | + if (error) | |
12938 | + break; | |
12939 | + | |
12940 | + error = resend_cluster_requests(ls); | |
12941 | + if (error) | |
12942 | + break; | |
12943 | + | |
12944 | + restbl_grant_after_purge(ls); | |
12945 | + | |
12946 | + log_all(ls, "recover event %u finished", finish_event); | |
12947 | + break; | |
12948 | + | |
12949 | + case DO_STOP: | |
12950 | + next_state = LSST_WAIT_START; | |
12951 | + break; | |
12952 | + | |
12953 | + case DO_FINISH_STOP: | |
12954 | + clear_finished_nodes(ls, finish_event); | |
12955 | + next_state = LSST_WAIT_START; | |
12956 | + break; | |
12957 | + | |
12958 | + case DO_FINISH_START: | |
12959 | + clear_finished_nodes(ls, finish_event); | |
12960 | + /* fall into DO_START */ | |
12961 | + | |
12962 | + case DO_START: | |
12963 | + error = ls_reconfig(ls, gr); | |
12964 | + if (error) | |
12965 | + next_state = LSST_WAIT_START; | |
12966 | + else | |
12967 | + next_state = LSST_RECONFIG_DONE; | |
12968 | + break; | |
12969 | + | |
12970 | + default: | |
12971 | + GDLM_ASSERT(0,); | |
12972 | + } | |
12973 | + goto out; | |
12974 | + } | |
12975 | + | |
12976 | + /* | |
12977 | + * LSST_INIT - state after ls is created and before it has been | |
12978 | + * started. A start operation will cause the ls to be started for the | |
12979 | + * first time. A failed start will cause to just wait in INIT for | |
12980 | + * another stop/start. | |
12981 | + */ | |
12982 | + | |
12983 | + if (cur_state == LSST_INIT) { | |
12984 | + switch (do_now) { | |
12985 | + case DO_START: | |
12986 | + error = ls_first_start(ls, gr); | |
12987 | + if (!error) | |
12988 | + next_state = LSST_INIT_DONE; | |
12989 | + break; | |
12990 | + | |
12991 | + case DO_STOP: | |
12992 | + break; | |
12993 | + | |
12994 | + case DO_FINISH: /* invalid */ | |
12995 | + case DO_FINISH_STOP: /* invalid */ | |
12996 | + case DO_FINISH_START: /* invalid */ | |
12997 | + default: | |
12998 | + GDLM_ASSERT(0,); | |
12999 | + } | |
13000 | + goto out; | |
13001 | + } | |
13002 | + | |
13003 | + /* | |
13004 | + * LSST_INIT_DONE - after the first start operation is completed | |
13005 | + * successfully and kcl_start_done() called. If there are no errors, a | |
13006 | + * finish will arrive next and we'll move to LSST_CLEAR. | |
13007 | + */ | |
13008 | + | |
13009 | + if (cur_state == LSST_INIT_DONE) { | |
13010 | + switch (do_now) { | |
13011 | + case DO_STOP: | |
13012 | + case DO_FINISH_STOP: | |
13013 | + next_state = LSST_WAIT_START; | |
13014 | + break; | |
13015 | + | |
13016 | + case DO_START: | |
13017 | + case DO_FINISH_START: | |
13018 | + error = ls_reconfig(ls, gr); | |
13019 | + if (error) | |
13020 | + next_state = LSST_WAIT_START; | |
13021 | + else | |
13022 | + next_state = LSST_RECONFIG_DONE; | |
13023 | + break; | |
13024 | + | |
13025 | + case DO_FINISH: | |
13026 | + next_state = LSST_CLEAR; | |
13027 | + enable_locking(ls, finish_event); | |
13028 | + log_all(ls, "recover event %u finished", finish_event); | |
13029 | + break; | |
13030 | + | |
13031 | + default: | |
13032 | + GDLM_ASSERT(0,); | |
13033 | + } | |
13034 | + goto out; | |
13035 | + } | |
13036 | + | |
13037 | + out: | |
13038 | + if (next_state) | |
13039 | + ls->ls_state = next_state; | |
13040 | + | |
13041 | + if (gr) { | |
13042 | + kfree(gr->gr_nodeids); | |
13043 | + free_dlm_recover(gr); | |
13044 | + } | |
13045 | +} | |
13046 | + | |
13047 | +static __inline__ gd_ls_t *get_work(int clear) | |
13048 | +{ | |
13049 | + gd_ls_t *ls; | |
13050 | + | |
13051 | + spin_lock(&lslist_lock); | |
13052 | + | |
13053 | + list_for_each_entry(ls, &lslist, ls_list) { | |
13054 | + if (clear) { | |
13055 | + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) | |
13056 | + goto got_work; | |
13057 | + | |
13058 | + } else { | |
13059 | + if (test_bit(LSFL_WORK, &ls->ls_flags)) | |
13060 | + goto got_work; | |
13061 | + } | |
13062 | + } | |
13063 | + ls = NULL; | |
13064 | + | |
13065 | + got_work: | |
13066 | + spin_unlock(&lslist_lock); | |
13067 | + | |
13068 | + return ls; | |
13069 | +} | |
13070 | + | |
13071 | +/* | |
13072 | + * Thread which does recovery for all lockspaces. | |
13073 | + */ | |
13074 | + | |
13075 | +static int dlm_recoverd(void *arg) | |
13076 | +{ | |
13077 | + gd_ls_t *ls; | |
13078 | + | |
13079 | + daemonize("dlm_recoverd"); | |
13080 | + recoverd_task = current; | |
13081 | + complete(&recoverd_run); | |
13082 | + | |
13083 | + while (!test_bit(THREAD_STOP, &recoverd_flags)) { | |
13084 | + wchan_cond_sleep_intr(recoverd_wait, !get_work(0)); | |
13085 | + if ((ls = get_work(1))) | |
13086 | + do_ls_recovery(ls); | |
13087 | + } | |
13088 | + | |
13089 | + complete(&recoverd_run); | |
13090 | + return 0; | |
13091 | +} | |
13092 | + | |
13093 | +/* | |
13094 | + * Mark a specific lockspace as needing work and wake up the thread to do it. | |
13095 | + */ | |
13096 | + | |
13097 | +void recoverd_kick(gd_ls_t *ls) | |
13098 | +{ | |
13099 | + set_bit(LSFL_WORK, &ls->ls_flags); | |
13100 | + wake_up(&recoverd_wait); | |
13101 | +} | |
13102 | + | |
13103 | +/* | |
13104 | + * Start the recoverd thread when gdlm is started (before any lockspaces). | |
13105 | + */ | |
13106 | + | |
13107 | +int recoverd_start(void) | |
13108 | +{ | |
13109 | + int error; | |
13110 | + | |
13111 | + clear_bit(THREAD_STOP, &recoverd_flags); | |
13112 | + error = kernel_thread(dlm_recoverd, NULL, 0); | |
13113 | + if (error < 0) | |
13114 | + goto out; | |
13115 | + | |
13116 | + error = 0; | |
13117 | + wait_for_completion(&recoverd_run); | |
13118 | + | |
13119 | + out: | |
13120 | + return error; | |
13121 | +} | |
13122 | + | |
13123 | +/* | |
13124 | + * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone). | |
13125 | + */ | |
13126 | + | |
13127 | +int recoverd_stop(void) | |
13128 | +{ | |
13129 | + set_bit(THREAD_STOP, &recoverd_flags); | |
13130 | + wake_up(&recoverd_wait); | |
13131 | + wait_for_completion(&recoverd_run); | |
13132 | + | |
13133 | + return 0; | |
13134 | +} | |
13135 | diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h | |
13136 | --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 13137 | +++ linux-patched/cluster/dlm/recoverd.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 13138 | @@ -0,0 +1,22 @@ |
13139 | +/****************************************************************************** | |
13140 | +******************************************************************************* | |
13141 | +** | |
13142 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
13143 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
13144 | +** | |
13145 | +** This copyrighted material is made available to anyone wishing to use, | |
13146 | +** modify, copy, or redistribute it subject to the terms and conditions | |
13147 | +** of the GNU General Public License v.2. | |
13148 | +** | |
13149 | +******************************************************************************* | |
13150 | +******************************************************************************/ | |
13151 | + | |
13152 | +#ifndef __RECOVERD_DOT_H__ | |
13153 | +#define __RECOVERD_DOT_H__ | |
13154 | + | |
13155 | +void dlm_recoverd_init(void); | |
13156 | +void recoverd_kick(gd_ls_t * ls); | |
13157 | +int recoverd_start(void); | |
13158 | +int recoverd_stop(void); | |
13159 | + | |
13160 | +#endif /* __RECOVERD_DOT_H__ */ | |
13161 | diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c | |
13162 | --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 13163 | +++ linux-patched/cluster/dlm/rsb.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 13164 | @@ -0,0 +1,307 @@ |
13165 | +/****************************************************************************** | |
13166 | +******************************************************************************* | |
13167 | +** | |
13168 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
13169 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
13170 | +** | |
13171 | +** This copyrighted material is made available to anyone wishing to use, | |
13172 | +** modify, copy, or redistribute it subject to the terms and conditions | |
13173 | +** of the GNU General Public License v.2. | |
13174 | +** | |
13175 | +******************************************************************************* | |
13176 | +******************************************************************************/ | |
13177 | + | |
13178 | +#include "dlm_internal.h" | |
13179 | +#include "locking.h" | |
13180 | +#include "memory.h" | |
13181 | +#include "lockqueue.h" | |
13182 | +#include "nodes.h" | |
13183 | +#include "dir.h" | |
13184 | +#include "util.h" | |
13185 | + | |
13186 | +static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent, | |
13187 | + char *name, int namelen) | |
13188 | +{ | |
13189 | + gd_res_t *r; | |
13190 | + | |
13191 | + list_for_each_entry(r, head, res_hashchain) { | |
13192 | + if ((parent == r->res_parent) && (namelen == r->res_length) && | |
13193 | + (memcmp(name, r->res_name, namelen) == 0)) { | |
13194 | + atomic_inc(&r->res_ref); | |
13195 | + return r; | |
13196 | + } | |
13197 | + } | |
13198 | + | |
13199 | + return NULL; | |
13200 | +} | |
13201 | + | |
13202 | +/* | |
13203 | + * A way to arbitrarily hold onto an rsb which we already have a reference to | |
13204 | + * to make sure it doesn't go away. Opposite of release_rsb(). | |
13205 | + */ | |
13206 | + | |
13207 | +void hold_rsb(gd_res_t *r) | |
13208 | +{ | |
13209 | + atomic_inc(&r->res_ref); | |
13210 | +} | |
13211 | + | |
13212 | +/* | |
13213 | + * release_rsb() - Decrement reference count on rsb struct. Free the rsb | |
13214 | + * struct when there are zero references. Every lkb for the rsb adds a | |
13215 | + * reference. When ref is zero there can be no more lkb's for the rsb, on the | |
13216 | + * queue's or anywhere else. | |
13217 | + */ | |
13218 | + | |
13219 | +void release_rsb(gd_res_t *r) | |
13220 | +{ | |
13221 | + gd_ls_t *ls = r->res_ls; | |
13222 | + int removed = FALSE; | |
13223 | + | |
13224 | + write_lock(&ls->ls_reshash_lock); | |
13225 | + atomic_dec(&r->res_ref); | |
13226 | + | |
13227 | + if (!atomic_read(&r->res_ref)) { | |
13228 | + GDLM_ASSERT(list_empty(&r->res_grantqueue),); | |
13229 | + GDLM_ASSERT(list_empty(&r->res_waitqueue),); | |
13230 | + GDLM_ASSERT(list_empty(&r->res_convertqueue),); | |
13231 | + removed = TRUE; | |
13232 | + list_del(&r->res_hashchain); | |
13233 | + } | |
13234 | + write_unlock(&ls->ls_reshash_lock); | |
13235 | + | |
13236 | + if (removed) { | |
13237 | + down_read(&ls->ls_gap_rsblist); | |
13238 | + if (r->res_parent) | |
13239 | + list_del(&r->res_subreslist); | |
13240 | + else | |
13241 | + list_del(&r->res_rootlist); | |
13242 | + up_read(&ls->ls_gap_rsblist); | |
13243 | + | |
13244 | + /* | |
13245 | + * Remove resdir entry if this was a locally mastered root rsb. | |
13246 | + */ | |
13247 | + if (!r->res_parent && !r->res_nodeid) { | |
13248 | + if (get_directory_nodeid(r) != our_nodeid()) | |
13249 | + remote_remove_resdata(r->res_ls, | |
13250 | + get_directory_nodeid(r), | |
13251 | + r->res_name, | |
13252 | + r->res_length, | |
13253 | + r->res_resdir_seq); | |
13254 | + else | |
13255 | + remove_resdata(r->res_ls, our_nodeid(), | |
13256 | + r->res_name, r->res_length, | |
13257 | + r->res_resdir_seq); | |
13258 | + } | |
13259 | + | |
13260 | + if (r->res_lvbptr) | |
13261 | + free_lvb(r->res_lvbptr); | |
13262 | + | |
13263 | + free_rsb(r); | |
13264 | + } | |
13265 | +} | |
13266 | + | |
13267 | +/* | |
13268 | + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist. | |
13269 | + * If the rsb exists, its ref count is incremented by this function. If it | |
13270 | + * doesn't exist, it's created with a ref count of one. | |
13271 | + */ | |
13272 | + | |
13273 | +int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen, | |
13274 | + int create, gd_res_t **rp) | |
13275 | +{ | |
13276 | + uint32_t hash; | |
13277 | + gd_res_t *r, *tmp; | |
13278 | + int error = -ENOMEM; | |
13279 | + | |
13280 | + GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); | |
13281 | + | |
13282 | + hash = gdlm_hash(name, namelen); | |
13283 | + hash &= ls->ls_hashmask; | |
13284 | + | |
13285 | + read_lock(&ls->ls_reshash_lock); | |
13286 | + r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen); | |
13287 | + read_unlock(&ls->ls_reshash_lock); | |
13288 | + | |
13289 | + if (r) | |
13290 | + goto out_set; | |
13291 | + if (!create) { | |
13292 | + *rp = NULL; | |
13293 | + goto out; | |
13294 | + } | |
13295 | + | |
13296 | + r = allocate_rsb(ls, namelen); | |
13297 | + if (!r) | |
13298 | + goto fail; | |
13299 | + | |
13300 | + INIT_LIST_HEAD(&r->res_subreslist); | |
13301 | + INIT_LIST_HEAD(&r->res_grantqueue); | |
13302 | + INIT_LIST_HEAD(&r->res_convertqueue); | |
13303 | + INIT_LIST_HEAD(&r->res_waitqueue); | |
13304 | + | |
13305 | + memcpy(r->res_name, name, namelen); | |
13306 | + r->res_length = namelen; | |
13307 | + r->res_ls = ls; | |
13308 | + init_rwsem(&r->res_lock); | |
13309 | + atomic_set(&r->res_ref, 1); | |
13310 | + | |
13311 | + if (parent) { | |
13312 | + r->res_parent = parent; | |
13313 | + r->res_depth = parent->res_depth + 1; | |
13314 | + r->res_root = parent->res_root; | |
13315 | + r->res_nodeid = parent->res_nodeid; | |
13316 | + } else { | |
13317 | + r->res_parent = NULL; | |
13318 | + r->res_depth = 1; | |
13319 | + r->res_root = r; | |
13320 | + r->res_nodeid = -1; | |
13321 | + } | |
13322 | + | |
13323 | + write_lock(&ls->ls_reshash_lock); | |
13324 | + tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen); | |
13325 | + if (tmp) { | |
13326 | + write_unlock(&ls->ls_reshash_lock); | |
13327 | + free_rsb(r); | |
13328 | + r = tmp; | |
13329 | + } else { | |
13330 | + list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]); | |
13331 | + write_unlock(&ls->ls_reshash_lock); | |
13332 | + | |
13333 | + down_read(&ls->ls_gap_rsblist); | |
13334 | + if (parent) | |
13335 | + list_add_tail(&r->res_subreslist, | |
13336 | + &r->res_root->res_subreslist); | |
13337 | + else | |
13338 | + list_add(&r->res_rootlist, &ls->ls_rootres); | |
13339 | + up_read(&ls->ls_gap_rsblist); | |
13340 | + } | |
13341 | + | |
13342 | + out_set: | |
13343 | + *rp = r; | |
13344 | + | |
13345 | + out: | |
13346 | + error = 0; | |
13347 | + | |
13348 | + fail: | |
13349 | + return error; | |
13350 | +} | |
13351 | + | |
13352 | +/* | |
13353 | + * Add a LKB to a resource's grant/convert/wait queue. in order | |
13354 | + */ | |
13355 | + | |
13356 | +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode) | |
13357 | +{ | |
13358 | + gd_lkb_t *lkb = NULL; | |
13359 | + | |
13360 | + list_for_each_entry(lkb, head, lkb_statequeue) { | |
13361 | + if (lkb->lkb_rqmode < mode) | |
13362 | + break; | |
13363 | + } | |
13364 | + | |
13365 | + if (!lkb) { | |
13366 | + /* No entries in the queue, we are alone */ | |
13367 | + list_add_tail(new, head); | |
13368 | + } else { | |
13369 | + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); | |
13370 | + } | |
13371 | +} | |
13372 | + | |
13373 | +/* | |
13374 | + * The rsb res_lock must be held in write when this function is called. | |
13375 | + */ | |
13376 | + | |
13377 | +void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type) | |
13378 | +{ | |
13379 | + | |
13380 | + GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status);); | |
13381 | + | |
13382 | + lkb->lkb_status = type; | |
13383 | + | |
13384 | + switch (type) { | |
13385 | + case GDLM_LKSTS_WAITING: | |
13386 | + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); | |
13387 | + break; | |
13388 | + | |
13389 | + case GDLM_LKSTS_GRANTED: | |
13390 | + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, | |
13391 | + lkb->lkb_grmode); | |
13392 | + break; | |
13393 | + | |
13394 | + case GDLM_LKSTS_CONVERT: | |
13395 | + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE) | |
13396 | + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); | |
13397 | + | |
13398 | + else | |
13399 | + if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT) | |
13400 | + list_add_tail(&lkb->lkb_statequeue, | |
13401 | + &r->res_convertqueue); | |
13402 | + else | |
13403 | + lkb_add_ordered(&lkb->lkb_statequeue, | |
13404 | + &r->res_convertqueue, lkb->lkb_rqmode); | |
13405 | + break; | |
13406 | + | |
13407 | + default: | |
13408 | + GDLM_ASSERT(0,); | |
13409 | + } | |
13410 | +} | |
13411 | + | |
13412 | +void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type) | |
13413 | +{ | |
13414 | + down_write(&r->res_lock); | |
13415 | + lkb_enqueue(r, lkb, type); | |
13416 | + up_write(&r->res_lock); | |
13417 | +} | |
13418 | + | |
13419 | +/* | |
13420 | + * The rsb res_lock must be held in write when this function is called. | |
13421 | + */ | |
13422 | + | |
13423 | +int lkb_dequeue(gd_lkb_t *lkb) | |
13424 | +{ | |
13425 | + int status = lkb->lkb_status; | |
13426 | + | |
13427 | + if (!status) | |
13428 | + goto out; | |
13429 | + | |
13430 | + lkb->lkb_status = 0; | |
13431 | + list_del(&lkb->lkb_statequeue); | |
13432 | + | |
13433 | + out: | |
13434 | + return status; | |
13435 | +} | |
13436 | + | |
13437 | +int res_lkb_dequeue(gd_lkb_t *lkb) | |
13438 | +{ | |
13439 | + int status; | |
13440 | + | |
13441 | + down_write(&lkb->lkb_resource->res_lock); | |
13442 | + status = lkb_dequeue(lkb); | |
13443 | + up_write(&lkb->lkb_resource->res_lock); | |
13444 | + | |
13445 | + return status; | |
13446 | +} | |
13447 | + | |
13448 | +/* | |
13449 | + * The rsb res_lock must be held in write when this function is called. | |
13450 | + */ | |
13451 | + | |
13452 | +int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type) | |
13453 | +{ | |
13454 | + int status; | |
13455 | + | |
13456 | + status = lkb_dequeue(lkb); | |
13457 | + lkb_enqueue(r, lkb, type); | |
13458 | + | |
13459 | + return status; | |
13460 | +} | |
13461 | + | |
13462 | +int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type) | |
13463 | +{ | |
13464 | + int status; | |
13465 | + | |
13466 | + down_write(&r->res_lock); | |
13467 | + status = lkb_swqueue(r, lkb, type); | |
13468 | + up_write(&r->res_lock); | |
13469 | + | |
13470 | + return status; | |
13471 | +} | |
13472 | diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h | |
13473 | --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 13474 | +++ linux-patched/cluster/dlm/rsb.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 13475 | @@ -0,0 +1,30 @@ |
13476 | +/****************************************************************************** | |
13477 | +******************************************************************************* | |
13478 | +** | |
13479 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
13480 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
13481 | +** | |
13482 | +** This copyrighted material is made available to anyone wishing to use, | |
13483 | +** modify, copy, or redistribute it subject to the terms and conditions | |
13484 | +** of the GNU General Public License v.2. | |
13485 | +** | |
13486 | +******************************************************************************* | |
13487 | +******************************************************************************/ | |
13488 | + | |
13489 | +#ifndef __RSB_DOT_H__ | |
13490 | +#define __RSB_DOT_H__ | |
13491 | + | |
13492 | +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode); | |
13493 | +void _release_rsb(gd_res_t * r); | |
13494 | +void release_rsb(gd_res_t * r); | |
13495 | +void hold_rsb(gd_res_t * r); | |
13496 | +int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen, | |
13497 | + int create, gd_res_t ** rp); | |
13498 | +void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type); | |
13499 | +void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type); | |
13500 | +int lkb_dequeue(gd_lkb_t * lkb); | |
13501 | +int res_lkb_dequeue(gd_lkb_t * lkb); | |
13502 | +int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type); | |
13503 | +int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type); | |
13504 | + | |
13505 | +#endif /* __RSB_DOT_H__ */ | |
13506 | diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c | |
13507 | --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 13508 | +++ linux-patched/cluster/dlm/util.c 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 13509 | @@ -0,0 +1,130 @@ |
13510 | +/****************************************************************************** | |
13511 | +******************************************************************************* | |
13512 | +** | |
13513 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
13514 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
13515 | +** | |
13516 | +** This copyrighted material is made available to anyone wishing to use, | |
13517 | +** modify, copy, or redistribute it subject to the terms and conditions | |
13518 | +** of the GNU General Public License v.2. | |
13519 | +** | |
13520 | +******************************************************************************* | |
13521 | +******************************************************************************/ | |
13522 | + | |
13523 | +#include "dlm_internal.h" | |
13524 | + | |
13525 | +static const uint32_t crc_32_tab[] = { | |
13526 | + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, | |
13527 | + 0xe963a535, 0x9e6495a3, | |
13528 | + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, | |
13529 | + 0xe7b82d07, 0x90bf1d91, | |
13530 | + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, | |
13531 | + 0xf4d4b551, 0x83d385c7, | |
13532 | + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, | |
13533 | + 0xfa0f3d63, 0x8d080df5, | |
13534 | + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, | |
13535 | + 0xd20d85fd, 0xa50ab56b, | |
13536 | + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, | |
13537 | + 0xdcd60dcf, 0xabd13d59, | |
13538 | + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, | |
13539 | + 0xcfba9599, 0xb8bda50f, | |
13540 | + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, | |
13541 | + 0xc1611dab, 0xb6662d3d, | |
13542 | + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, | |
13543 | + 0x9fbfe4a5, 0xe8b8d433, | |
13544 | + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, | |
13545 | + 0x91646c97, 0xe6635c01, | |
13546 | + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, | |
13547 | + 0x8208f4c1, 0xf50fc457, | |
13548 | + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, | |
13549 | + 0x8cd37cf3, 0xfbd44c65, | |
13550 | + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, | |
13551 | + 0xa4d1c46d, 0xd3d6f4fb, | |
13552 | + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, | |
13553 | + 0xaa0a4c5f, 0xdd0d7cc9, | |
13554 | + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, | |
13555 | + 0xb966d409, 0xce61e49f, | |
13556 | + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, | |
13557 | + 0xb7bd5c3b, 0xc0ba6cad, | |
13558 | + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, | |
13559 | + 0x04db2615, 0x73dc1683, | |
13560 | + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, | |
13561 | + 0x0a00ae27, 0x7d079eb1, | |
13562 | + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, | |
13563 | + 0x196c3671, 0x6e6b06e7, | |
13564 | + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, | |
13565 | + 0x17b7be43, 0x60b08ed5, | |
13566 | + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, | |
13567 | + 0x3fb506dd, 0x48b2364b, | |
13568 | + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, | |
13569 | + 0x316e8eef, 0x4669be79, | |
13570 | + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, | |
13571 | + 0x220216b9, 0x5505262f, | |
13572 | + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, | |
13573 | + 0x2cd99e8b, 0x5bdeae1d, | |
13574 | + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, | |
13575 | + 0x72076785, 0x05005713, | |
13576 | + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, | |
13577 | + 0x7cdcefb7, 0x0bdbdf21, | |
13578 | + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, | |
13579 | + 0x6fb077e1, 0x18b74777, | |
13580 | + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, | |
13581 | + 0x616bffd3, 0x166ccf45, | |
13582 | + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, | |
13583 | + 0x4969474d, 0x3e6e77db, | |
13584 | + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, | |
13585 | + 0x47b2cf7f, 0x30b5ffe9, | |
13586 | + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, | |
13587 | + 0x54de5729, 0x23d967bf, | |
13588 | + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, | |
13589 | + 0x5a05df1b, 0x2d02ef8d | |
13590 | +}; | |
13591 | + | |
13592 | +/** | |
13593 | + * gdlm_hash - hash an array of data | |
13594 | + * @data: the data to be hashed | |
13595 | + * @len: the length of data to be hashed | |
13596 | + * | |
13597 | + * Copied from GFS. | |
13598 | + * | |
13599 | + * Take some data and convert it to a 32-bit hash. | |
13600 | + * | |
13601 | + * The hash function is a 32-bit CRC of the data. The algorithm uses | |
13602 | + * the crc_32_tab table above. | |
13603 | + * | |
13604 | + * This may not be the fastest hash function, but it does a fair bit better | |
13605 | + * at providing uniform results than the others I've looked at. That's | |
13606 | + * really important for efficient directories. | |
13607 | + * | |
13608 | + * Returns: the hash | |
13609 | + */ | |
13610 | + | |
13611 | +uint32_t gdlm_hash(const char *data, int len) | |
13612 | +{ | |
13613 | + uint32_t hash = 0xFFFFFFFF; | |
13614 | + | |
13615 | + for (; len--; data++) | |
13616 | + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8); | |
13617 | + | |
13618 | + hash = ~hash; | |
13619 | + | |
13620 | + return hash; | |
13621 | +} | |
13622 | + | |
13623 | +uint32_t gdlm_next_power2(uint32_t val) | |
13624 | +{ | |
13625 | + uint32_t x; | |
13626 | + | |
13627 | + for (x = 1; x < val; x <<= 1) ; | |
13628 | + | |
13629 | + return x; | |
13630 | +} | |
13631 | + | |
13632 | +void print_lkb(gd_lkb_t *lkb) | |
13633 | +{ | |
13634 | + printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d " | |
13635 | + "nodeid=%u lqstate=%x lqflags=%x\n", | |
13636 | + lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status, | |
13637 | + lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid, | |
13638 | + lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags); | |
13639 | +} | |
13640 | diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h | |
13641 | --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 13642 | +++ linux-patched/cluster/dlm/util.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 13643 | @@ -0,0 +1,22 @@ |
13644 | +/****************************************************************************** | |
13645 | +******************************************************************************* | |
13646 | +** | |
13647 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
13648 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
13649 | +** | |
13650 | +** This copyrighted material is made available to anyone wishing to use, | |
13651 | +** modify, copy, or redistribute it subject to the terms and conditions | |
13652 | +** of the GNU General Public License v.2. | |
13653 | +** | |
13654 | +******************************************************************************* | |
13655 | +******************************************************************************/ | |
13656 | + | |
13657 | +#ifndef __UTIL_DOT_H__ | |
13658 | +#define __UTIL_DOT_H__ | |
13659 | + | |
13660 | +uint32_t gdlm_hash(const char *data, int len); | |
13661 | +uint32_t gdlm_next_power2(uint32_t val); | |
13662 | + | |
13663 | +void print_lkb(gd_lkb_t *lkb); | |
13664 | + | |
13665 | +#endif | |
13666 | diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h | |
13667 | --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 13668 | +++ linux-patched/include/cluster/dlm.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 13669 | @@ -0,0 +1,404 @@ |
13670 | +/****************************************************************************** | |
13671 | +******************************************************************************* | |
13672 | +** | |
13673 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
13674 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
13675 | +** | |
13676 | +** This copyrighted material is made available to anyone wishing to use, | |
13677 | +** modify, copy, or redistribute it subject to the terms and conditions | |
13678 | +** of the GNU General Public License v.2. | |
13679 | +** | |
13680 | +******************************************************************************* | |
13681 | +******************************************************************************/ | |
13682 | + | |
13683 | +#ifndef __DLM_DOT_H__ | |
13684 | +#define __DLM_DOT_H__ | |
13685 | + | |
13686 | +/* | |
13687 | + * Interface to DLM - routines and structures to use DLM lockspaces. | |
13688 | + */ | |
13689 | + | |
13690 | +/* | |
13691 | + * Lock Modes | |
13692 | + */ | |
13693 | + | |
13694 | +#define DLM_LOCK_IV (-1) /* invalid */ | |
13695 | +#define DLM_LOCK_NL (0) /* null */ | |
13696 | +#define DLM_LOCK_CR (1) /* concurrent read */ | |
13697 | +#define DLM_LOCK_CW (2) /* concurrent write */ | |
13698 | +#define DLM_LOCK_PR (3) /* protected read */ | |
13699 | +#define DLM_LOCK_PW (4) /* protected write */ | |
13700 | +#define DLM_LOCK_EX (5) /* exclusive */ | |
13701 | + | |
13702 | +/* | |
13703 | + * Maximum size in bytes of a dlm_lock name | |
13704 | + */ | |
13705 | + | |
13706 | +#define DLM_RESNAME_MAXLEN (64) | |
13707 | + | |
13708 | +/* | |
13709 | + * Size in bytes of Lock Value Block | |
13710 | + */ | |
13711 | + | |
13712 | +#define DLM_LVB_LEN (32) | |
13713 | + | |
13714 | +/* | |
13715 | + * Flags to dlm_new_lockspace | |
13716 | + * | |
13717 | + * DLM_LSF_NOTIMERS | |
13718 | + * | |
13719 | + * Do not subject locks in this lockspace to time-outs. | |
13720 | + * | |
13721 | + */ | |
13722 | + | |
13723 | +#define DLM_LSF_NOTIMERS (1) | |
13724 | + | |
13725 | +/* | |
13726 | + * Flags to dlm_lock | |
13727 | + * | |
13728 | + * DLM_LKF_NOQUEUE | |
13729 | + * | |
13730 | + * Do not queue the lock request on the wait queue if it cannot be granted | |
13731 | + * immediately. If the lock cannot be granted because of this flag, DLM will | |
13732 | + * either return -EAGAIN from the dlm_lock call or will return 0 from | |
13733 | + * dlm_lock and -EAGAIN in the lock status block when the AST is executed. | |
13734 | + * | |
13735 | + * DLM_LKF_CONVERT | |
13736 | + * | |
13737 | + * Indicates a lock conversion request. For conversions the name and namelen | |
13738 | + * are ignored and the lock ID in the LKSB is used to identify the lock. | |
13739 | + * | |
13740 | + * DLM_LKF_VALBLK | |
13741 | + * | |
13742 | + * Requests DLM to return the current contents of the lock value block in the | |
13743 | + * lock status block. When this flag is set in a lock conversion from PW or EX | |
13744 | + * modes, DLM assigns the value specified in the lock status block to the lock | |
13745 | + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array | |
13746 | + * containing application-specific information. | |
13747 | + * | |
13748 | + * DLM_LKF_QUECVT | |
13749 | + * | |
13750 | + * Force a conversion lock request to the back of the convert queue. All other | |
13751 | + * conversion requests ahead of it must be granted before it can be granted. | |
13752 | + * This enforces a FIFO ordering on the convert queue. When this flag is set, | |
13753 | + * indefinite postponement is averted. This flag is allowed only when | |
13754 | + * converting a lock to a more restrictive mode. | |
13755 | + * | |
13756 | + * DLM_LKF_CANCEL | |
13757 | + * | |
13758 | + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to | |
13759 | + * previously granted mode. | |
13760 | + * | |
13761 | + * DLM_LKF_IVVALBLK | |
13762 | + * | |
13763 | + * Invalidate/clear the lock value block. | |
13764 | + * | |
13765 | + * DLM_LKF_CONVDEADLK | |
13766 | + * | |
13767 | + * The granted mode of a lock being converted (from a non-NL mode) can be | |
13768 | + * changed to NL in the process of acquiring the requested mode to avoid | |
13769 | + * conversion deadlock. | |
13770 | + * | |
13771 | + * DLM_LKF_PERSISTENT | |
13772 | + * | |
13773 | + * Only relevant to locks originating in userspace. Signals to the ioctl.c code | |
13774 | + * that this lock should not be unlocked when the process exits. | |
13775 | + * | |
13776 | + * DLM_LKF_NODLCKWT | |
13777 | + * | |
13778 | + * This lock is not to be checked for conversion deadlocks. | |
13779 | + * | |
13780 | + * DLM_LKF_NODLCKBLK | |
13781 | + * | |
13782 | + * not yet implemented | |
13783 | + * | |
13784 | + * DLM_LKF_EXPEDITE | |
13785 | + * | |
13786 | + * If this lock conversion cannot be granted immediately it is to go to the | |
13787 | + * head of the conversion queue regardless of its requested lock mode. | |
13788 | + * | |
13789 | + * DLM_LKF_NOQUEUEBAST | |
13790 | + * | |
13791 | + * Send blocking AST's before returning -EAGAIN to the caller. It is only | |
13792 | + * used along with the NOQUEUE flag. Blocking AST's are not sent for failed | |
13793 | + * NOQUEUE requests otherwise. | |
13794 | + * | |
13795 | + */ | |
13796 | + | |
13797 | +#define DLM_LKF_NOQUEUE (0x00000001) | |
13798 | +#define DLM_LKF_CANCEL (0x00000002) | |
13799 | +#define DLM_LKF_CONVERT (0x00000004) | |
13800 | +#define DLM_LKF_VALBLK (0x00000008) | |
13801 | +#define DLM_LKF_QUECVT (0x00000010) | |
13802 | +#define DLM_LKF_IVVALBLK (0x00000020) | |
13803 | +#define DLM_LKF_CONVDEADLK (0x00000040) | |
13804 | +#define DLM_LKF_PERSISTENT (0x00000080) | |
13805 | +#define DLM_LKF_NODLCKWT (0x00000100) | |
13806 | +#define DLM_LKF_NODLCKBLK (0x00000200) | |
13807 | +#define DLM_LKF_EXPEDITE (0x00000400) | |
13808 | +#define DLM_LKF_NOQUEUEBAST (0x00000800) | |
13809 | + | |
13810 | +/* | |
13811 | + * Some return codes that are not in errno.h | |
13812 | + */ | |
13813 | + | |
13814 | +#define DLM_ECANCEL (0x10001) | |
13815 | +#define DLM_EUNLOCK (0x10002) | |
13816 | + | |
13817 | +typedef void dlm_lockspace_t; | |
13818 | + | |
13819 | +/* | |
13820 | + * Lock range structure | |
13821 | + */ | |
13822 | + | |
13823 | +struct dlm_range { | |
13824 | + uint64_t ra_start; | |
13825 | + uint64_t ra_end; | |
13826 | +}; | |
13827 | + | |
13828 | +/* | |
13829 | + * Lock status block | |
13830 | + * | |
13831 | + * Use this structure to specify the contents of the lock value block. For a | |
13832 | + * conversion request, this structure is used to specify the lock ID of the | |
13833 | + * lock. DLM writes the status of the lock request and the lock ID assigned | |
13834 | + * to the request in the lock status block. | |
13835 | + * | |
13836 | + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests. | |
13837 | + * It is available when dlm_lock returns. | |
13838 | + * | |
13839 | + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules | |
13840 | + * shown for the DLM_LKF_VALBLK flag. | |
13841 | + * | |
13842 | + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock, | |
13843 | + * it was first demoted to NL to avoid conversion deadlock. | |
13844 | + * | |
13845 | + * sb_status: the returned status of the lock request set prior to AST | |
13846 | + * execution. Possible return values: | |
13847 | + * | |
13848 | + * 0 if lock request was successful | |
13849 | + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE | |
13850 | + * -ENOMEM if there is no memory to process request | |
13851 | + * -EINVAL if there are invalid parameters | |
13852 | + * -DLM_EUNLOCK if unlock request was successful | |
13853 | + * -DLM_ECANCEL ? | |
13854 | + */ | |
13855 | + | |
13856 | +#define DLM_SBF_DEMOTED (0x01) | |
13857 | + | |
13858 | +struct dlm_lksb { | |
13859 | + int sb_status; | |
13860 | + uint32_t sb_lkid; | |
13861 | + char sb_flags; | |
13862 | + char * sb_lvbptr; | |
13863 | +}; | |
13864 | + | |
13865 | +/* | |
13866 | + * These defines are the bits that make up the | |
13867 | + * query code. | |
13868 | + */ | |
13869 | + | |
13870 | +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in | |
13871 | + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */ | |
13872 | +#define DLM_LOCK_THIS 0x0007 | |
13873 | +#define DLM_QUERY_MODE_MASK 0x0007 | |
13874 | + | |
13875 | +/* Bits 3, 4, 5 bitmap of queue(s) to query */ | |
13876 | +#define DLM_QUERY_QUEUE_WAIT 0x0008 | |
13877 | +#define DLM_QUERY_QUEUE_CONVERT 0x0010 | |
13878 | +#define DLM_QUERY_QUEUE_GRANT 0x0020 | |
13879 | +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */ | |
13880 | +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */ | |
13881 | + | |
13882 | +/* Bit 6, Return only the information that can be established without a network | |
13883 | + * round-trip. The caller must be aware of the implications of this. Useful for | |
13884 | + * just getting the master node id or resource name. */ | |
13885 | +#define DLM_QUERY_LOCAL 0x0040 | |
13886 | + | |
13887 | +/* Bits 8 up, query type */ | |
13888 | +#define DLM_QUERY_LOCKS_HIGHER 0x0100 | |
13889 | +#define DLM_QUERY_LOCKS_LOWER 0x0200 | |
13890 | +#define DLM_QUERY_LOCKS_EQUAL 0x0300 | |
13891 | +#define DLM_QUERY_LOCKS_BLOCKING 0x0400 | |
13892 | +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500 | |
13893 | +#define DLM_QUERY_LOCKS_ALL 0x0600 | |
13894 | +#define DLM_QUERY_MASK 0x0F00 | |
13895 | + | |
13896 | +/* GRMODE is the default for mode comparisons, | |
13897 | + RQMODE might also be handy */ | |
13898 | +#define DLM_QUERY_GRMODE 0x0000 | |
13899 | +#define DLM_QUERY_RQMODE 0x1000 | |
13900 | + | |
13901 | +/* Structures passed into and out of the query */ | |
13902 | + | |
13903 | +struct dlm_lockinfo { | |
13904 | + int lki_lkid; /* Lock ID on originating node */ | |
13905 | + int lki_mstlkid; /* Lock ID on master node */ | |
13906 | + int lki_parent; | |
13907 | + int lki_node; /* Originating node (not master) */ | |
13908 | + uint8_t lki_state; /* Queue the lock is on */ | |
13909 | + uint8_t lki_grmode; /* Granted mode */ | |
13910 | + uint8_t lki_rqmode; /* Requested mode */ | |
13911 | + struct dlm_range lki_grrange; /* Granted range, if applicable */ | |
13912 | + struct dlm_range lki_rqrange; /* Requested range, if applicable */ | |
13913 | +}; | |
13914 | + | |
13915 | +struct dlm_resinfo { | |
13916 | + int rsi_length; | |
13917 | + int rsi_grantcount; /* No. of nodes on grant queue */ | |
13918 | + int rsi_convcount; /* No. of nodes on convert queue */ | |
13919 | + int rsi_waitcount; /* No. of nodes on wait queue */ | |
13920 | + int rsi_masternode; /* Master for this resource */ | |
13921 | + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */ | |
13922 | + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable | |
13923 | + */ | |
13924 | +}; | |
13925 | + | |
13926 | +struct dlm_queryinfo { | |
13927 | + struct dlm_resinfo *gqi_resinfo; | |
13928 | + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array | |
13929 | + * of structs */ | |
13930 | + int gqi_locksize; /* input */ | |
13931 | + int gqi_lockcount; /* output */ | |
13932 | +}; | |
13933 | + | |
13934 | +#ifdef __KERNEL__ | |
13935 | +/* | |
13936 | + * dlm_init | |
13937 | + * | |
13938 | + * Starts and initializes DLM threads and structures. Creation of the first | |
13939 | + * lockspace will call this if it has not been called already. | |
13940 | + * | |
13941 | + * Returns: 0 if successful, -EXXX on error | |
13942 | + */ | |
13943 | + | |
13944 | +int dlm_init(void); | |
13945 | + | |
13946 | +/* | |
13947 | + * dlm_release | |
13948 | + * | |
13949 | + * Stops DLM threads. | |
13950 | + * | |
13951 | + * Returns: 0 if successful, -EXXX on error | |
13952 | + */ | |
13953 | + | |
13954 | +int dlm_release(void); | |
13955 | + | |
13956 | +/* | |
13957 | + * dlm_new_lockspace | |
13958 | + * | |
13959 | + * Starts a lockspace with the given name. If the named lockspace exists in | |
13960 | + * the cluster, the calling node joins it. | |
13961 | + */ | |
13962 | + | |
13963 | +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace, | |
13964 | + int flags); | |
13965 | + | |
13966 | +/* | |
13967 | + * dlm_release_lockspace | |
13968 | + * | |
13969 | + * Stop a lockspace. | |
13970 | + */ | |
13971 | + | |
13972 | +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); | |
13973 | + | |
13974 | +/* | |
13975 | + * dlm_lock | |
13976 | + * | |
13977 | + * Make an asynchronous request to acquire or convert a lock on a named | |
13978 | + * resource. | |
13979 | + * | |
13980 | + * lockspace: context for the request | |
13981 | + * mode: the requested mode of the lock (DLM_LOCK_) | |
13982 | + * lksb: lock status block for input and async return values | |
13983 | + * flags: input flags (DLM_LKF_) | |
13984 | + * name: name of the resource to lock, can be binary | |
13985 | + * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN) | |
13986 | + * parent: the lock ID of a parent lock or 0 if none | |
13987 | + * lockast: function DLM executes when it completes processing the request | |
13988 | + * astarg: argument passed to lockast and bast functions | |
13989 | + * bast: function DLM executes when this lock later blocks another request | |
13990 | + * | |
13991 | + * Returns: | |
13992 | + * 0 if request is successfully queued for processing | |
13993 | + * -EINVAL if any input parameters are invalid | |
13994 | + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE | |
13995 | + * -ENOMEM if there is no memory to process request | |
13996 | + * -ENOTCONN if there is a communication error | |
13997 | + * | |
13998 | + * If the call to dlm_lock returns an error then the operation has failed and | |
13999 | + * the AST routine will not be called. If dlm_lock returns 0 it is still | |
14000 | + * possible that the lock operation will fail. The AST routine will be called | |
14001 | + * when the locking is complete and the status is returned in the lksb. | |
14002 | + * | |
14003 | + * If the AST routines or parameter are passed to a conversion operation then | |
14004 | + * they will overwrite those values that were passed to a previous dlm_lock | |
14005 | + * call. | |
14006 | + * | |
14007 | + * AST routines should not block (at least not for long), but may make | |
14008 | + * any locking calls they please. | |
14009 | + */ | |
14010 | + | |
14011 | +int dlm_lock(dlm_lockspace_t *lockspace, | |
14012 | + uint32_t mode, | |
14013 | + struct dlm_lksb *lksb, | |
14014 | + uint32_t flags, | |
14015 | + void *name, | |
14016 | + unsigned int namelen, | |
14017 | + uint32_t parent, | |
14018 | + void (*lockast) (void *astarg), | |
14019 | + void *astarg, | |
14020 | + void (*bast) (void *astarg, int mode), | |
14021 | + struct dlm_range *range); | |
14022 | + | |
14023 | +/* | |
14024 | + * dlm_unlock | |
14025 | + * | |
14026 | + * Asynchronously release a lock on a resource. The AST routine is called | |
14027 | + * when the resource is successfully unlocked. | |
14028 | + * | |
14029 | + * lockspace: context for the request | |
14030 | + * lkid: the lock ID as returned in the lksb | |
14031 | + * flags: input flags (DLM_LKF_) | |
14032 | + * lksb: if NULL the lksb parameter passed to last lock request is used | |
14033 | + * astarg: if NULL, astarg in last lock request is used | |
14034 | + * | |
14035 | + * Returns: | |
14036 | + * 0 if request is successfully queued for processing | |
14037 | + * -EINVAL if any input parameters are invalid | |
14038 | + * -ENOTEMPTY if the lock still has sublocks | |
14039 | + * -EBUSY if the lock is waiting for a remote lock operation | |
14040 | + * -ENOTCONN if there is a communication error | |
14041 | + */ | |
14042 | + | |
14043 | +extern int dlm_unlock(dlm_lockspace_t *lockspace, | |
14044 | + uint32_t lkid, | |
14045 | + uint32_t flags, | |
14046 | + struct dlm_lksb *lksb, | |
14047 | + void *astarg); | |
14048 | + | |
14049 | +/* Query interface | |
14050 | + * | |
14051 | + * Query the other holders of a resource, given a known lock ID | |
14052 | + * | |
14053 | + * lockspace: context for the request | |
14054 | + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock | |
14055 | + * on the resource. sb_status will contain the status | |
14056 | + * of the request on completion. | |
14057 | + * query: query bitmap see DLM_QUERY_* above | |
14058 | + * qinfo: pointer to dlm_queryinfo structure | |
14059 | + * ast_routine: AST routine to call on completion | |
14060 | + * astarg: argument to AST routine. It is "traditional" | |
14061 | + * to put the qinfo pointer into lksb->sb_lvbptr | |
14062 | + * and pass the lksb in here. | |
14063 | + */ | |
14064 | +extern int dlm_query(dlm_lockspace_t *lockspace, | |
14065 | + struct dlm_lksb *lksb, | |
14066 | + int query, | |
14067 | + struct dlm_queryinfo *qinfo, | |
14068 | + void (ast_routine(void *)), | |
14069 | + void *astarg); | |
14070 | + | |
14071 | +#endif /* __KERNEL__ */ | |
14072 | + | |
14073 | +#endif /* __DLM_DOT_H__ */ | |
14074 | diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h | |
14075 | --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730 | |
5cdbd17b | 14076 | +++ linux-patched/include/cluster/dlm_device.h 2004-06-29 20:01:20.000000000 +0800 |
4bf12011 | 14077 | @@ -0,0 +1,63 @@ |
14078 | +/****************************************************************************** | |
14079 | +******************************************************************************* | |
14080 | +** | |
14081 | +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | |
14082 | +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. | |
14083 | +** | |
14084 | +** This copyrighted material is made available to anyone wishing to use, | |
14085 | +** modify, copy, or redistribute it subject to the terms and conditions | |
14086 | +** of the GNU General Public License v.2. | |
14087 | +** | |
14088 | +******************************************************************************* | |
14089 | +******************************************************************************/ | |
14090 | + | |
14091 | +/* This is the device interface for dlm, most users will use a library | |
14092 | + * interface. | |
14093 | + */ | |
14094 | + | |
14095 | +/* Version of the device interface */ | |
14096 | +#define DLM_DEVICE_VERSION_MAJOR 2 | |
14097 | +#define DLM_DEVICE_VERSION_MINOR 0 | |
14098 | +#define DLM_DEVICE_VERSION_PATCH 0 | |
14099 | + | |
14100 | +/* struct passed to the lock write */ | |
14101 | +struct dlm_lock_params { | |
14102 | + uint32_t version[3]; | |
14103 | + uint8_t cmd; | |
14104 | + uint8_t mode; | |
14105 | + uint16_t flags; | |
14106 | + uint32_t lkid; | |
14107 | + uint32_t parent; | |
14108 | + struct dlm_range range; | |
14109 | + uint8_t namelen; | |
14110 | + void *astparam; | |
14111 | + void *astaddr; | |
14112 | + void *bastaddr; | |
14113 | + struct dlm_lksb *lksb; | |
14114 | + char name[1]; | |
14115 | +}; | |
14116 | + | |
14117 | + | |
14118 | +/* struct read from the "device" fd, | |
14119 | + consists mainly of userspace pointers for the library to use */ | |
14120 | +struct dlm_lock_result { | |
14121 | + uint8_t cmd; | |
14122 | + void *astparam; | |
14123 | + void (*astaddr)(void *astparam); | |
14124 | + struct dlm_lksb *user_lksb; | |
14125 | + struct dlm_lksb lksb; /* But this has real data in it */ | |
14126 | + uint8_t bast_mode; /* Not yet used */ | |
14127 | +}; | |
14128 | + | |
14129 | +/* commands passed to the device */ | |
14130 | +#define DLM_USER_LOCK 1 | |
14131 | +#define DLM_USER_UNLOCK 2 | |
14132 | +#define DLM_USER_QUERY 3 | |
14133 | + | |
14134 | +/* Arbitrary length restriction */ | |
14135 | +#define MAX_LS_NAME_LEN 64 | |
14136 | + | |
14137 | +/* ioctls on the device */ | |
14138 | +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *) | |
14139 | +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *) | |
14140 | +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *) |