--- linux-2.5.64/drivers/block/Kconfig.pre-enbd	Mon Mar 24 15:56:52 2003
+++ linux-2.5.64/drivers/block/Kconfig	Mon Mar 24 17:45:35 2003
@@ -292,6 +292,15 @@
 	  If unsure, say N.
 
+config ENBD
+	bool 'Enhanced network block device'
+	depends on NET
+	---help---
+	  To use the ENBD support, you must say Y here and select one
+	  of the driver's units (e.g. BLK_DEV_ENBD, BLK_DEV_ENBD_IOCTL).
+
+source "drivers/block/enbd/Kconfig"
+
 config BLK_DEV_RAM
 	tristate "RAM disk support"
 	---help---
--- linux-2.5.64/drivers/block/enbd/enbd_base.c.pre-enbd	Mon Mar 24 18:55:25 2003
+++ linux-2.5.64/drivers/block/enbd/enbd_base.c	Tue Mar 25 15:44:12 2003
@@ -0,0 +1,4172 @@
+/*
+ * (Enhanced) Network block device - make block devices work over TCP
+ *
+ * Original NBD Copyright 1997 Pavel Machek
+ * Further ENBD Copyrights 1998, 1999, 2000 Peter Breuer
+ *
+ * ATTENTION: You need the userspace daemons available from
+ *     ftp://oboe.it.uc3m.es/pub/Programs/nbd-2.4.*.tgz
+ *     and/or the ENBD project on http://freshmeat.net
+ *
+ * Development of the ENBD software has been supported by grants and
+ * contributions from Realm Information Technologies, Inc. of 5555
+ * Oakbrook Parkway, NW Norcross, GA and iNsu Innovations Inc. of
+ * 3465, Boulevard Thimens, Saint-Laurent, Quebec, Canada.
+ *
+ * ------------ Pavel's history notes ----------------------------------
+ * 97-3-25  compiled 0-th version, not yet tested it
+ *          (it did not work, BTW) (later that day) HEY! it works!
+ *          (bit later) hmm, not that much... 2:00am next day:
+ *          yes, it works, but it gives something like 50kB/sec
+ * 97-3-28  it's completely strange - when using 1024 byte "packets"
+ *          it gives 50kB/sec and CPU idle; with 2048 bytes it gives
+ *          500kB/sec (and CPU loaded 100% as it should be) (all done
+ *          against localhost)
+ * 97-4-1   complete rewrite to make it possible for many requests at
+ *          once to be processed
+ * 97-4-1   23:57 rewrite once again to make it work :-(
+ * 97-4-3   00:02 hmm, it does not work.
+ * 97-4-3   23:06 hmm, it will need one more rewrite :-)
+ * 97-4-10  It looks like it's working and stable. But I still do not
+ *          have any recovery from lost connection...
+ *          (setq tab-width 4)
+ * 97-4-11  Making protocol independent of endianity etc.
+ * 97-4-15  Probably one more rewrite, since it loses requests under
+ *          heavy loads
+ * 97-9-13  Cosmetic changes
+ *
+ * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
+ * why not: would need verify_area and friends, would share yet another
+ *          structure with userland
+ *
+ * FIXME: not module-safe
+ *
+ * ------------ Peter's history notes ----------------------------------
+ * 98-12-18 modules now OK ptb@it.uc3m.es (Peter Breuer) ported to 2.0.*.
+ *          + better debugging. Still possible lockup in connection with APM
+ *          and spurious interrupt - only on write. Error treatment should
+ *          be improved. After 100 errors from end_request the kernel can
+ *          do anything. We should catch it ourselves.
+ * 99-1-sometime fixed lockup by extending semaphore - ptb v1.0
+ * 99-3-sometime reconnect protocol (client mod agreed by pavel) - ptb v1.1
+ * 99-4-25 add /proc/nbdinfo - ptb v1.1.1
+ * 99-4-sometime add multiplex - ptb v1.2
+ * 99-4-26 fix multiplex and redundancy - ptb v1.2.1
+ * 99-4-29 reentrant client threads - ptb v1.2.2
+ * 99-4-29 socket related stuff placed in user space - amarin v1.3.0
+ * 99-5-3 fix all, all writes had to be before all reads - ptb v1.2.4
+ * 99-5-5 fix out-of-order, async - ptb v1.2.5
+ * 99-5-7 semaphores removed (still works!), fail cases corrected - ptb v1.2.6
+ * 99-5-12 signals unblocked in xmit, blksize != 1024 fixed, ioctls
+ *         added - ptb v1.2.7
+ * 99-6-1 interaction with client split into two functions - amarin v1.3.0
+ * 99-6-3 reintegrated fully, mem manager fixed, accounts fixed - ptb v1.2.8.3
+ * 99-6-3 extra queue removed, mem manager removed - ptb v1.2.8.4
+ * 99-7-3 buffer registration introduced - ptb v1.2.8.5
+ * 99-7-3 some client redundancy reestablished - ptb v2.1.1
+ * 99-7-10 encapsulated queue calls. One element rollback buffer - ptb v2.1.2
+ * 99-7-20 timestamp and rollback old abandoned request - ptb v2.1.3
+ * 99-7-24 64bit file sizes and offsets accepted - ptb v2.1.9
+ * 99-7-26 experimental request coalesces - ptb v2.1.10
+ * 99-7-27 partitioning scheme - ptb v2.2.1
+ * 99-8-3 nbd_clr_sock bug in invalidate_device fixed? - ptb v2.2.4
+ * 99-8-5 reverse replace of block_fsync, add sig ioctls - ptb v2.2.5
+ *        reverse bug introduced about v2.2.3 for compound reqs - ptb v2.2.5
+ *        fix clear_que bug (didn't rollback first) from 2.1.3 - ptb v2.2.5
+ * 99-8-22 workaround strange nr_sectors bug - ptb v2.2.6
+ * 99-8-11 fix MY_NBD_SYNC bug. Never sync'ed all - ptb v2.2.7
+ * 99-8-12 wakeups all moved to enqueue - ptb v2.2.7
+ * 99-8-23 remove slot->cli_age - ptb v2.2.7
+ * 99-8-24 first 8 bytes of signature embedded in packets - ptb v2.2.8
+ *         fix SET_SIG define buglet, remove hardcoded constants - ptb v2.2.8
+ *         fix huge bug. Missing copy_fromfs in my_nbd_ack - ptb v2.2.8
+ *         removed signature embedding and all other decorations - ptb v2.2.8
+ * 99-8-25 recast fix in my_nbd_ack to avoid align. bug - ptb v2.2.9
+ *         put in MKDEVs and put back some hardcode const fixes - ptb v2.2.10
+ * 99-9-29 fix BLKGETSIZE bug - ptb v2.2.14
+ * 99-10-2 run with interrupts on throughout. Think we lose some - ptb v2.2.15
+ * 99-10-8 trim dead code, kernel 2.2 ifdef's - ptb v2.2.17
+ * 99-12-18 further o-o - ptb v2.2.19
+ * 99-12-28 queue account cleanup. endio on queue reqs at reset - ptb v2.2.20
+ *          interruptible semaphores for better client recovery - ptb v2.2.20
+ * 00-1-2 debugging cleanups. Fix race in end_request - ptb v2.2.21
+ * 00-1-4 semaphores simplified - ptb v2.2.22
+ * 00-6-8 emergency control by write to proc - ptb v2.2.24
+ * 00-7-20 ported to 2.4.0-test1. Possible minor bugs found/fixed - ptb v2.2.24
+ * 00-7-27 changed proc i/f to read_proc from get_info in 2.2/2.4 - ptb v2.2.25
+ * 00-7-30 fixed reads before writes under 2.4 by disabling merge - ptb v2.2.25
+ * 00-7-30 and fixed merge_reqs for 2.4, now that I understand! - ptb v2.2.25
+ * 00-7-30 fixed/introduced possible bug in end_io for 2.2/2.4 - ptb v2.2.25
+ * 00-7-30 added timeval/zone field in requests and replies - ptb v2.4.0
+ * 00-7-30 fixed hitherto masked bug in read_stat in nbd_client - ptb v2.4.0
+ * 00-7-30 added timeout to net writes in nbd_client - ptb v2.4.0
+ * 00-8-20 display fix for devices over 2GB - ptb v2.4.5
+ * 00-8-23 more 64 bit fixes + error out overrange requests - ptb v2.4.6/2.2.27
+ * 00-8-31 add NBD_ERR ioctl to error out slot request - ptb v2.4.9
+ * 00-8-31 soften NBD_SOFT_RESET so doesn't wreck protocol - ptb v2.4.9
+ * 00-9-1 remove %L's from printfs. Kernel 2.2 doesn't - ptb v2.4.10/2.2.27
+ * 00-9-6 add various state flags to help init order - ptb v2.4.11
+ * 00-9-8 add checks for device initialised to set_sock - ptb v2.4.12
+ * 00-9-17 en/disable device as aslot count goes through 0 - ptb v2.4.13/2.2.28
+ * 00-9-21 split read/write dev req counts for accounting - ptb v2.4.14
+ * 00-9-21 renamed sync_intvl to req_timeo - ptb v2.4.14
+ * 00-9-21 made sync_intvl count write blocks - ptb v2.4.14
+ * 00-9-22 repair enable after delayed disable when disabled - ptb v2.4.14
+ * 00-9-22 include sync (nonblocking) after sync_intvl reqs - ptb v2.4.14
+ * 00-9-25 disable sync (nonblocking) after sync_intvl reqs - ptb v2.4.14
+ * 00-9-25 bundle invalidate_buffers in clr_sock - ptb v2.4.14
+ * 00-10-20 implement req_timeo per device + ioctl (Wang Gang) - ptb v2.4.15
+ * 00-10-20 add raid mode (Wang Gang) - ptb v2.4.15
+ * 00-10-26 throttle in do_req - ptb v2.4.15
+ * 00-10-28 do set_sock on first open and clr_sock on last close - ptb v2.4.15
+ * 00-11-01 make sync_intvl really sync - ptb v2.4.15
+ * 00-11-14 rename throttle to plug, nbd_sync takes arg - ptb v2.4.17
+ * 00-11-19 clr_sock errs req not rollback if show_errs & !aslot - ptb v2.4.17
+ * 00-11-20 removed autodeadlock when disabled in do_req end_req - ptb v2.4.17
+ * 00-11-21 make MY_NBD_SYNC only sync when sync_intvl > 0 - ptb v2.4.17
+ * 00-12-24 make MY_NBD_GET_REQ use a timeout arg - ptb v2.4.18
+ * 01-02-12 ported to 2.4.0 (works). do_nbd_request rewritten - ptb v2.4.20
+ * 01-02-20 managed to get plugging and clustered read/writes OK - ptb v2.4.21
+ * 01-02-21 eliminated slot->buflen for the time being - ptb v2.4.21
+ * 01-02-27 added proper devfs support - ptb v2.4.22
+ * 01-03-15 allowed more devices/in devfs, cleaned up endio - ptb v2.4.23
+ * 01-03-15 added device letter (<= 3 chars) to struct - ptb v2.4.23
+ * 01-03-15 added request size check to do_nbd_req - ptb v2.4.23
+ * 01-03-15 increased MAX_SECTORS to 512 by default - ptb v2.4.23
+ * 01-03-15 made major number a module parameter - ptb v2.4.23
+ * 01-03-18 added max_sectors array - ptb v2.4.23
+ * 01-03-23 added devfs links - ptb v2.4.23
+ * 01-04-17 plugging always enabled for 2.4 kernels - ptb v2.4.24
+ * 01-04-17 made SET_RO set_device_ro as well as set local flags - ptb v2.4.25
+ * 01-04-28 impl SET_MD5SUM ioctl and proc support for md5sum - ptb v2.4.25
+ * 01-04-29 added accounting for md5'd reqs - ptb v2.4.25
+ * 01-07-29 added atomic protections for accounting - ptb v2.4.25
+ * 01-08-01 fixed 2.4 smp bugs. Interrupts off in spinlocks - ptb v2.4.25
+ * 01-08-01 removed all semaphores for spinlocks - ptb v2.4.25
+ * 01-08-01 invalidate_buffers in clr_sock (req'd Rogier Wolff) - ptb v2.4.25
+ * 01-08-02 fixed smp deadlock - end_that_request_first slept! - ptb v2.4.26
+ * 01-10-16 provisionally added error in device open when not enabled - ptb v2.4.27
+ * 01-10-18 added DIRTY flag to save on repeated invalidate_buffers - ptb v2.4.27
+ * 01-10-31 increment seqno_out before delivery, so really starts at 1 - v2.4.27
+ * 01-11-01 move zeroing of seqno in cmd field to nbe_end_req* - ptb v2.4.27
+ * 01-11-18 add speed calculation, dev fields, display in proc - ptb v2.4.27
+ * 01-11-20 modifications for compiling into monolithic kernel - ptb v2.4.27
+ * 01-12-06 clr requests before reenabling, not after, in nbd_enable - ptb 2.4.27
+ * 02-02-21 make nbd_rollback modal, absorbing nbd_error - ptb 2.4.27
+ * 02-08-08 added local BLKSSZGET (reject) and related ioctls - ptb 2.4.30
+ * 02-08-12 make nbd_ack not ruin req when it's rolled back already - ptb 2.4.30
+ * 02-09-18 fix __FUNCTION__ for new gcc - ptb 2.4.30
+ * 02-09-18 always allow daemon death even with reqs waiting - ptb 2.4.30
+ * 02-09-18 eliminate SYNC_REQD, RLSE_REQD - ptb 2.4.30
+ * 02-09-18 eliminate speed_lim - ptb 2.4.30
+ * 02-09-18 fix countq accounting - ptb 2.4.30
+ * 02-09-18 encapsulate remote ioctl handling - ptb 2.4.30
+ * 02-09-18 remote ioctl uses kernel req, not our fake one - ptb 2.4.30
+ * 02-09-18 eliminated ctldta use (too much tricky logic) - ptb 2.4.30
+ * 02-09-28 handle req specials - ptb 2.4.30
+ * 02-10-10 introduce DIRECT flag - ptb 2.4.30
+ * 02-10-13 rollback pushes reqs to local queue, not queues them! - ptb 2.4.30
+ * 02-10-13 add hooks for separate ioctl module - ptb 2.4.30
+ * 02-10-16 take set_sock out of open. Put pid check in handshake - ptb 2.4.30
+ * 02-10-16 define MY_NBD_GET_NPORT ioctl - ptb 2.4.30
+ * 02-10-18 remove wait from MY_NBD_SYNC ioctl - ptb 2.4.30
+ * 02-10-20 rollback adds requests to queue in seqno order - ptb 2.4.30
+ * 02-10-23 introduce and use pid_sem instead of req_sem - ptb 2.4.30
+ * 02-10-30 support client fallback to ioctls on whole disk - ptb 2.4.30
+ * 02-11-3 moved set INITIALISED up to coincide with setting inode - ptb 2.4.30
+ * 02-11-3 add media check and revalidate routines - ptb 2.4.30
+ * 02-11-4 encapsulate lives++ and ENABLED changes into nbd_enable - ptb 2.4.30
+ * 02-11-4 set_enable from proc only enables, not clears queue - ptb 2.4.30
+ * 02-11-4 take blk_put_request out of end_request (it locks!) - ptb 2.4.30
+ * 02-11-4 replace list_del by list_del_init - ptb 2.4.30
+ * 02-12-7 nbd_release made aware of daemons on whole disk - ptb 2.4.30
+ * 03-01-7 added ioctls for setfaulty etc. - ptb 2.4.31
+ * 03-02-1 used metalock for non-queue changes - ptb 2.4.31
+ * 03-03-12 add md_list notification ioctls - ptb 2.4.31
+ */
+
+#include
+#ifndef UNIX98_PTY_MAJOR_COUNT
+ #define UNIX98_PTY_MAJOR_COUNT 8
+ #ifndef UNIX98_NR_MAJORS
+  #define UNIX98_NR_MAJORS UNIX98_PTY_MAJOR_COUNT
+ #endif
+#endif
+
+#include
+
+#if defined(__GNUC__) && __GNUC__ >= 2
+#define _LOOSE_KERNEL_NAMES
+#endif
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include /* PTB - when did this arrive in kernel? */
+#include
+#include
+
+#define MAJOR_NR NBD_MAJOR
+static int major = MAJOR_NR;
+
+#include
+#include
+#include
+
+#include /* PTB - when did this arrive in kernel? */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/* *
+ * PTB --------------- compatibility ------------------- *
+ *     layer starts here.                                *
+ */
+
+/*
+ * PTB BH_Protected disappeared somewhere around 2.4.10 but this is
+ * still needed for the very rare write local/read remote mode. Don't
+ * worry about it in normal operation!
+ */
+#define mark_buffer_protected(rbh) \
+{ \
+    mark_buffer_dirty (rbh); \
+    mark_buffer_uptodate (rbh, 1); \
+    refile_buffer (rbh); \
+}
+
+/* PTB list interface extensions */
+#define list_head(ptr, type, member) \
+    (list_empty(ptr)?NULL:list_entry(((struct list_head *)ptr)->next,type,member))
+#define list_tail(ptr, type, member) \
+    (list_empty(ptr)?NULL:list_entry(((struct list_head *)ptr)->prev,type,member))
+
+/* PTB for arches without the atomic mask ops (and no smp, I think!)
+ * - feel free to correct with assembler
+ */
+#ifndef atomic_set_mask
+#define atomic_set_mask(mask, x) (x)->counter |= (mask)
+#endif
+#ifndef atomic_clear_mask
+#define atomic_clear_mask(mask, x) (x)->counter &= ~(mask)
+#endif
+
+/* *
+ * PTB --------------- compatibility ------------------- *
+ *     layer ENDS here.                                  *
+ */
+
+int linux_version_code = LINUX_VERSION_CODE;
+
+#include
+#include
+#include
+
+/*
+ * PTB kernel data - 4KB worth
+ * We need space for nda, nda1, .. nda15, ndb, ndb1, ..
+ * The index is exactly the minor number.
+ */
+static int nbd_blksizes[MAX_NBD * NBD_MAXCONN];
+static int nbd_sizes[MAX_NBD * NBD_MAXCONN];
+static __u64 nbd_bytesizes[MAX_NBD * NBD_MAXCONN];
+static int nbd_max_sectors[MAX_NBD * NBD_MAXCONN];
+
+/*
+ * PTB our data - about 3KB
+ * These are nda, ndb, ndc, ...
+ * Divide the minor by NBD_MAXCONN to get this index.
+ */
+static struct nbd_device nbd_dev[MAX_NBD];
+static spinlock_t nbd_lock = SPIN_LOCK_UNLOCKED;
+static struct nbd_md nbd_md;
+static struct nbd_ioctl_stub nbd_remote_ioctl;
+
+struct nbd_device *
+nbd_get (int i)
+{
+    return &nbd_dev[i];
+}
+
+#define NBD_FAIL( s ) { \
+    NBD_DEBUG(1, s " (result %d).\n" , result ); \
+    goto error_out; \
+}
+#define NBD_HARDFAIL( s ) { \
+    NBD_ERROR( s " (result %d).\n" , result ); \
+    lo->harderror = result; \
+    goto hard_error_out; \
+}
+
+/*
+ * PTB device parameters. These are module parameters too.
+ */
+static int rahead = NBD_RAHEAD_DFLT;    /* PTB - read ahead blocks */
+static int sync_intvl = NBD_SYNC_INTVL; /* PTB - sync every n secs/Kreqs */
+static int merge_requests               /* PTB - bool, do request coalesce */
+    = NBD_MERGE_REQ_DFLT;
+static int buf_sectors = NBD_MAX_SECTORS;
+                                        /* PTB - user bufsize required */
+static int show_errs = 1;               /* PTB - RAID mode? not usually */
+static int direct = 0;                  /* PTB - all opens are O_DIRECT */
+static int plug = NBD_PLUG_DFLT;
+
+static int md5sum = 0;                  /* PTB - use md5summing write proto */
+static int md5_on_threshold = 1000;     /* PTB - reqs reqd to turn md5 on */
+static int md5_off_threshold = 10;      /* PTB - errs reqd to turn md5 off */
+
+#ifndef NO_BUFFERED_WRITES
+static int buffer_writes = 0;           /* PTB - act like ramd on write */
+#endif /* NO_BUFFERED_WRITES */
+
+#if defined(MODULE)
+MODULE_PARM (rahead, "i");
+MODULE_PARM (sync_intvl, "i");
+MODULE_PARM (merge_requests, "i");
+MODULE_PARM (buf_sectors, "i");
+MODULE_PARM (show_errs, "i");
+MODULE_PARM (direct, "i");
+#ifndef NO_BUFFERED_WRITES
+MODULE_PARM (buffer_writes, "i");
+#endif /* NO_BUFFERED_WRITES */
+MODULE_PARM (major, "i");
+MODULE_PARM (md5sum, "i");
+MODULE_PARM (md5_on_threshold, "i");
+MODULE_PARM (md5_off_threshold, "i");
+#endif
+
+// PTB This pointer is initialised in nbd_init.
+static struct request_queue *nbd_queue;
+
+#define NO_BUFFERED_WRITES 1
+
+/* *
+ * PTB --------------- functions ----------------------- *
+ */
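The static arrays above are indexed by raw minor number, while nbd_dev is indexed by the whole-device number, i.e. the minor divided by NBD_MAXCONN. A minimal standalone sketch of that decomposition follows; the EX_* names are illustrative stand-ins, assuming NBD_MAXCONN == 1 << NBD_SHIFT as nbd_open() below implies:

    /* Illustrative stand-in for the minor-number layout used below. */
    #define EX_NBD_SHIFT   4                    /* assumption: 16 minors per device */
    #define EX_NBD_MAXCONN (1 << EX_NBD_SHIFT)

    static void ex_decode_minor (int minor, int *nbd, int *part)
    {
        *nbd  = minor >> EX_NBD_SHIFT;          /* index into nbd_dev[]: nda, ndb, ... */
        *part = minor - (*nbd << EX_NBD_SHIFT); /* 0 = whole device, 1.. = partition */
    }

So minor 17, for example, selects nbd_dev[1] (ndb) partition 1, but nbd_blksizes[17].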
+/*
+ * PTB
+ * Decode the request type of a request and return it. Don't we
+ * have anywhere else to put this? Yes, in private data. But
+ * that's just a pointer to our device data so we don't use it.
+ *
+ * We use the low bit (REQ_RW) of the flags and the first high bit
+ * (REQ_NBD) to designate the type of request.
+ *
+ * @req the request to get the type of.
+ */
+static int
+rq_type (struct request *req)
+{
+    if (req->flags & REQ_SPECIAL)
+        return SPECIAL;
+
+    switch (((req->flags & REQ_RW) ? 1 : 0)
+            | ((req->flags & REQ_NBD) ? 2 : 0)) {
+    case 0:
+        return READ;
+    case 1:
+        return WRITE;
+    case 2:
+        return IOCTL;
+    case 3:
+        return MD5SUM;
+    }
+    // PTB report what we can of the strangeness if it is strange
+    return (req->flags < 4) ? -1 : req->flags;
+}
+
+/*
+ * PTB code the request type into a request.
+ *
+ * This appears to be only used when making an ioctl request and it
+ * never really escapes from our private area, and it doesn't matter too
+ * much how efficient it is either.
+ *
+ * This function marks a request for conventional viewing as
+ * being of the designated conceptual type. It corresponds to the old
+ * "type" field in requests.
+ *
+ * @req the request to set the type on
+ * @type one of READ, WRITE, etc.
+ */
+static void
+set_rq_type (struct request *req, int type)
+{
+    switch (type) {
+    case READ:
+        req->flags &= ~(REQ_RW | REQ_NBD | REQ_SPECIAL);
+        return;
+    case WRITE:
+        req->flags &= ~(REQ_NBD | REQ_SPECIAL);
+        req->flags |= REQ_RW;
+        return;
+    case IOCTL:
+        req->flags &= ~(REQ_RW | REQ_SPECIAL);
+        req->flags |= REQ_NBD;
+        return;
+    case MD5SUM:
+        req->flags &= ~REQ_SPECIAL;
+        req->flags |= REQ_RW | REQ_NBD;
+        return;
+    case SPECIAL:
+        req->flags |= REQ_RW | REQ_NBD | REQ_SPECIAL;
+        return;
+    }
+}
+
+/*
+ * PTB count number of blocks in a request. This will be an overestimate
+ * if the number is not an exact multiple. It seems to happen. We
+ * guarantee to return -ve only if the request is invalid.
+ *
+ * @req - request we want to count
+ */
+inline long
+nr_blks (struct request *req)
+{
+    unsigned log_sectors_per_blk;
+    unsigned sectors_per_blk;
+    int size;
+    int sectors;
+    struct nbd_device *lo;
+
+    if (!req)
+        return -EINVAL;
+
+    if (rq_type (req) == SPECIAL)   // PTB contains no data
+        return 0;
+
+    lo = req->rq_disk->private_data;
+
+    log_sectors_per_blk = lo->logblksize - 9;
+    sectors_per_blk = 1 << log_sectors_per_blk;
+
+    sectors = req->nr_sectors;
+    size = (sectors + sectors_per_blk - 1) >> log_sectors_per_blk;
+
+    return size;
+}
+
+/*
+ * Return a temporary buffer containing the (1 or 2 char) device letter.
+ * This works for i up to 26*26. 0 is "a". The buffer is zero
+ * terminated.
+ *
+ * @i number to be translated to x[y] alphabetical form.
+ */
+static char *
+device_letter (int i)
+{
+    static char buf[3];
+    static int cached_i = -1;
+
+    if (cached_i == i)
+        return buf;
+
+    cached_i = i;
+
+    if (i < 26) {
+        buf[0] = 'a' + i;
+        buf[1] = 0;
+        return buf;
+    }
+
+    buf[0] = 'a' + i / 26;
+    buf[1] = 'a' + i % 26;
+    buf[2] = 0;
+    return buf;
+}
+
+/*
+ * PTB auxiliary functions for manipulating the sequence number. Isn't
+ * there anything private we can use in a request?
+ *
+ * This function returns the seqno.
+ *
+ * @req the request to get the sequence number of
+ */
+static int
+rq_seqno (struct request *req)
+{
+    return req->flags >> __REQ_NBDSEQNO;
+}
+
+static void
+rq_set_seqno (struct request *req, int val)
+{
+    // PTB preserve first __REQ_NR_BITS bits
+    req->flags &= REQ_NBDSEQNO - 1;
+    // PTB shift by one more than strictly necessary (see rq_seqno)
+    req->flags |= val << __REQ_NBDSEQNO;
+}
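rq_type()/set_rq_type() pack the conceptual request type into the REQ_RW and REQ_NBD flag bits, and rq_seqno()/rq_set_seqno() keep a sequence number in the bits above __REQ_NBDSEQNO. A self-contained sketch of the same packing, with invented bit positions standing in for the kernel's real flag layout:

    /* Stand-in bit layout; the driver really uses REQ_RW, REQ_NBD and
     * __REQ_NBDSEQNO from the kernel's request flags. */
    enum { EX_RW = 1 << 0, EX_NBD = 1 << 1, EX_SEQ_SHIFT = 8 };

    static unsigned long ex_pack (int type, int seqno)
    {
        unsigned long flags = (unsigned long) seqno << EX_SEQ_SHIFT;
        if (type == 1 /* WRITE */ || type == 3 /* MD5SUM */)
            flags |= EX_RW;                 /* low bit: data direction */
        if (type == 2 /* IOCTL */ || type == 3 /* MD5SUM */)
            flags |= EX_NBD;                /* high bit: nbd-private type */
        return flags;
    }

    static int ex_type (unsigned long flags)
    {   /* mirrors the switch in rq_type() above */
        return ((flags & EX_RW) ? 1 : 0) | ((flags & EX_NBD) ? 2 : 0);
    }

As in rq_set_seqno(), the low flag bits survive a sequence-number update because only the bits at and above EX_SEQ_SHIFT are rewritten.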
+/*
+ * PTB sync the device. Modes:
+ *     @arg = 1: do it sync
+ *     @arg = 0: do it async
+ *
+ * We can't call sync_dev outside a process context. I don't know why.
+ * Death results from a scheduled attempt.
+ *
+ * Call without the semaphore held, as we lock it and call sync_dev.
+ */
+static void
+nbd_sync (struct nbd_device *lo, long arg)
+{
+    struct inode *inode = lo->inode;
+    short minor, nbd, islot;
+
+    islot = atomic_read (&lo->islot);
+
+    if (!(atomic_read (&lo->flags) & NBD_INITIALISED) || !inode) {
+        goto fail;
+    }
+
+    minor = minor (inode->i_rdev);
+    nbd = minor >> NBD_SHIFT;
+
+    // PTB sync_dev is async. fsync_dev is sync.
+    switch (arg) {
+    case 0:        // async
+        // PTB 2.5.7 does not have async sync! FIXME
+        break;
+    default:       // sync
+        fsync_bdev (inode->i_bdev);
+        invalidate_buffers (mk_kdev (major, nbd << NBD_SHIFT));
+        break;
+    }
+
+    return;
+
+  fail:
+    return;
+}
+
+static void
+nbd_async_sync (struct nbd_device *lo)
+{
+    nbd_sync (lo, 0);
+}
+
+static void
+nbd_sync_sync (struct nbd_device *lo)
+{
+    nbd_sync (lo, 1);
+}
+
+/*
+ * Do sync async if we're enabled, sync if we're not.
+ *
+ * @lo the device to maybe sync (sync or async sync!)
+ */
+static void
+nbd_maybe_sync_sync (struct nbd_device *lo)
+{
+    if ((atomic_read (&lo->flags) & NBD_ENABLED)
+        && !(atomic_read (&lo->flags) & NBD_REMOTE_INVALID)) {
+        nbd_async_sync (lo);
+        return;
+    }
+    nbd_sync_sync (lo);
+}
+
+/*
+ * PTB - put a request onto the head of a nbd device's queue
+ *     - presumably having taken it off the kernel's queue first!
+ *     - We take the queue spinlock.
+ *
+ * @lo = the device we are on (could we get it from the req?)
+ * @req = the request we shift
+ */
+static void
+nbd_enqueue (struct nbd_device *lo, struct request *req)
+{
+    long req_blks = nr_blks (req);
+
+    if (req_blks < 0) {
+        short islot = atomic_read (&lo->islot);
+        NBD_ERROR ("(%d): invalid req %p. Not touching!\n", islot, req);
+        return;
+    }
+
+    /* PTB accounting and nothing more - first, specials */
+    if (!(req->flags & REQ_SPECIAL)) {
+        // PTB the special req counting semantics relies on
+        // countq not including itself in the count!
+        int countq;
+        int cmd;
+
+        cmd = rq_data_dir (req);
+        atomic_add (req_blks, &lo->requests_in[cmd]);
+
+        // PTB do we need locks here? Apparently not.
+        atomic_inc (&lo->countq[cmd]);
+        countq = atomic_read (&lo->countq[cmd]);
+
+        // PTB the maxes are just noncritical stats
+        if (atomic_read (&lo->maxq[cmd]) < countq)
+            atomic_set (&lo->maxq[cmd], countq);
+        atomic_inc (&lo->req_in[cmd][req_blks]);
+        // PTB the maxes are just noncritical stats
+        if (atomic_read (&lo->maxreqblks) < req_blks)
+            atomic_set (&lo->maxreqblks, req_blks);
+    }
+
+    write_lock (&lo->queue_lock);
+
+    list_add (&req->queuelist, &lo->queue);
+
+    write_unlock (&lo->queue_lock);
+
+    wake_up_interruptible (&lo->wq);
+}
+
+/*
+ * PTB - remove a request from anywhere in the nbd device general queue
+ *     - return 0 for success, -ve for fail
+ *
+ * We need to hold the queue lock when calling this routine.
+ * It walks the queue.
+ *
+ * @lo the nbd device
+ * @req the request to be removed
+ */
+static int
+nbd_remove (struct nbd_device *lo, struct request *req)
+{
+    int cmd;
+
+    if (!req)
+        return -EINVAL;
+
+    list_del_init (&req->queuelist);
+
+    /* PTB accounting and nothing more */
+    cmd = rq_data_dir (req);
+    atomic_dec (&lo->countq[cmd]);
+    return 0;
+}
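nbd_enqueue() is the producer half of a handoff: it chains the request onto lo->queue under the queue lock and then wakes lo->wq. The consumer half is the client thread in nbd_get_req() further down. Condensed to its skeleton (locking and accounting elided; names as in this file, so this is an excerpt-style sketch, not compilable on its own):

    /* producer, from nbd_enqueue(): push at the head */
    list_add (&req->queuelist, &lo->queue);
    wake_up_interruptible (&lo->wq);

    /* consumer, from nbd_get_req(): oldest request is at the tail */
    while (list_empty (&lo->queue))
        interruptible_sleep_on_timeout (&lo->wq, time_left);
    req = list_tail (&lo->queue, struct request, queuelist);

Because list_add() pushes at the head and list_tail() harvests at the tail, the shared queue behaves as a FIFO.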
+ */
+
+/*
+ * PTB - Open the device. This is the blkops function.
+ */
+int
+nbd_open (struct inode *inode, struct file *file)
+{
+    int dev;
+    struct nbd_device *lo;
+    int nbd;
+    int part;
+    int islot;
+    char *devnam;
+
+    if (!inode && file) {    /* added by ptb for 2.0.35. Necessary? */
+        inode = file->f_dentry->d_inode;
+    }
+    if (!inode) {
+        NBD_ERROR ("null inode.\n");
+        return -EINVAL;
+    }
+
+    dev = minor (inode->i_rdev);
+    nbd = dev >> NBD_SHIFT;
+    part = dev - (nbd << NBD_SHIFT);
+    islot = part - 1;
+
+    if (nbd >= MAX_NBD) {
+        NBD_ERROR ("too many (%d) whole devices open\n", nbd);
+        return -ENODEV;
+    }
+
+    lo = &nbd_dev[nbd];
+    devnam = lo->devnam;
+
+    /* PTB provision for opening for direct i/o - gives mount aid */
+    if (file
+        && (atomic_read (&lo->flags) & NBD_DIRECT)
+        && !(file->f_flags & O_DIRECT)) {
+        /* PTB we set NOFOLLOW to show we did it! */
+        file->f_flags |= O_DIRECT | O_NOFOLLOW;
+    }
+
+    if (part == 0) {
+        /* PTB we have got the whole dev's file or inode for 1st time */
+        if (!lo->file || lo->file != file) {
+            lo->file = file;
+            atomic_set (&(&lo->wspeed)->frstj, jiffies);
+            atomic_set (&(&lo->rspeed)->frstj, jiffies);
+            atomic_set (&(&lo->tspeed)->frstj, jiffies);
+        }
+        if (!lo->inode || lo->inode != inode) {
+            lo->inode = inode;
+        }
+        if (!(atomic_read (&lo->flags) & NBD_INITIALISED)) {
+            atomic_set_mask (NBD_INITIALISED, &lo->flags);
+        }
+    }
+
+    atomic_inc (&lo->refcnt);
+
+    if (!(atomic_read (&lo->flags) & NBD_VALIDATED)
+        && lo->aslot > 0
+        && (atomic_read (&lo->flags) & NBD_ENABLED)) {
+        NBD_INFO ("partition check on device nd%s\n", lo->devnam);
+        check_disk_change (inode->i_bdev);
+        /*
+         * PTB do we set VALIDATED here, or let the kernel call
+         * sequence result in it happening via our removable
+         * device routines? Let's go for the latter option.
+         */
+    }
+
+    return 0;
+}
+/*
+ * PTB - complete a transaction irrefutably by taking it out of the
+ *     - slot pending position it is in, and reporting end_request to kernel
+ *
+ * We are called without locks because our call to end request
+ * will take some sort of lock momentarily and we don't need
+ * locks because our request should already be off all queues.
+ *
+ * @slot the nbd_slot on which the req notionally was
+ * @req the poor defenceless kernel request about to be acked
+ */
+void
+nbd_commit (struct nbd_slot *slot, struct request *req)
+{
+    struct nbd_device *lo = slot->lo;
+    long req_blks = nr_blks (req);
+    int cmd;
+
+    if (req_blks < 0) {
+        NBD_ERROR ("corrupted req %p. Not touching with bargepole.\n", req);
+        return;
+    }
+
+    list_del_init (&req->queuelist);
+
+    nbd_end_request_lock (req);
+    blk_put_request (req);
+
+    slot->req_age = 0;
+    slot->req -= req_blks;
+
+    /* PTB accounting and nothing more */
+    cmd = rq_data_dir (req);
+
+    atomic_sub (req_blks, &lo->requests_req[cmd]);
+    if (req->errors != 0) {
+        /* PTB error exit */
+        atomic_add (req_blks, &lo->requests_err);
+        slot->err += req_blks;
+        return;
+    }
+
+    atomic_add (req_blks, &lo->requests_out[cmd]);
+    slot->out += req_blks;
+
+    if (cmd != WRITE)
+        /* PTB everything but a write was easy */
+        return;
+
+    /*
+     * PTB now non error case writes
+     *
+     * account the 4 cases for a md5sum'd transaction
+     */
+    switch (slot->flags & (NBD_SLOT_MD5SUM | NBD_SLOT_MD5_OK)) {
+
+    case NBD_SLOT_MD5SUM | NBD_SLOT_MD5_OK:
+        atomic_add (req_blks, &lo->wrequests_5to);    // 11
+        atomic_add (req_blks, &lo->wrequests_5so);
+        // PTB zero the countdown to turning off md5 as it works
+        atomic_set (&lo->wrequests_5co, 0);
+        break;
+
+    case NBD_SLOT_MD5SUM:
+        atomic_add (req_blks, &lo->wrequests_5to);    // 10
+        atomic_add (req_blks, &lo->wrequests_5wo);
+        atomic_inc (&lo->wrequests_5co);
+        if (atomic_read (&lo->wrequests_5co) > md5_off_threshold) {
+            atomic_set (&lo->wrequests_5co, 0);
+            // PTB turn off md5summing as it is not successful
+            atomic_clear_mask (NBD_MD5SUM, &lo->flags);
+        }
+        break;
+
+    case NBD_SLOT_MD5_OK:
+        atomic_add (req_blks, &lo->wrequests_5to);    // 01
+        atomic_add (req_blks, &lo->wrequests_5eo);
+        atomic_inc (&lo->wrequests_5co);
+        if (atomic_read (&lo->wrequests_5co) > md5_off_threshold) {
+            atomic_set (&lo->wrequests_5co, 0);
+            // PTB turn off md5summing as it is errored
+            atomic_clear_mask (NBD_MD5SUM, &lo->flags);
+        }
+        break;
+
+    default:
+    case 0:
+        // PTB nobody asked for an md5 and nobody gave one back
+        atomic_inc (&lo->wrequests_5no);
+        if (atomic_read (&lo->wrequests_5no) > md5_on_threshold) {
+            atomic_set (&lo->wrequests_5no, 0);
+            // PTB turn on md5summing every so often
+            atomic_set_mask (NBD_MD5SUM, &lo->flags);
+        }
+        break;
+    }
+
+    // PTB clear the md5sum indicators from the slot afterwards!
+    slot->flags &= ~(NBD_SLOT_MD5SUM | NBD_SLOT_MD5_OK);
+
+    // PTB we ran out of difficult cases, so return
+}
+
+/*
+ * PTB - error out a transaction irrefutably by taking it out of the
+ *     - slot pending position it is in, and reporting end_request to kernel
+ *
+ * We must be called without spinlocks held, as we take it in end req.
+ *
+ * @slot the nbd_slot on which the req notionally was
+ * @req the poor defenceless kernel request about to be errored
+ */
+void
+nbd_error (struct nbd_slot *slot, struct request *req)
+{
+    struct nbd_device *lo = slot->lo;
+    long req_blks = nr_blks (req);
+    int cmd;
+
+    if (req_blks < 0) {
+        NBD_ERROR ("passed illegal request %p\n", req);
+    }
+
+    req->errors++;
+
+    /*
+     * PTB We don't need the queue spinlock since we don't touch our queue,
+     * and we're the only ones working on this slot.
+     */
+    list_del_init (&req->queuelist);
+
+    NBD_ALERT ("error out req %p from slot %d!\n", req, slot->i);
+
+    nbd_end_request_lock (req);
+    blk_put_request (req);
+
+    /* PTB accounting and nothing more */
+    cmd = rq_data_dir (req);
+    atomic_sub (req_blks, &lo->requests_req[cmd]);
+
+    slot->in -= req_blks;
+    slot->req -= req_blks;
+
+    slot->req_age = 0;
+    slot->err += req_blks;
+    atomic_add (req_blks, &lo->requests_err);
+}
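The write path in nbd_commit() implements a simple hysteresis: md5summing is switched on again after md5_on_threshold plain writes, and switched off after md5_off_threshold failed or refused md5 exchanges in a row. The same control logic in miniature, with plain C counters standing in for the lo->wrequests_5* atomics (a sketch, not the driver's actual data structures):

    /* Stand-in for the adaptive md5 toggle in nbd_commit(). */
    struct ex_md5_ctl {
        int enabled;
        int misses;    /* ~ lo->wrequests_5co: failed/refused md5 writes */
        int plain;     /* ~ lo->wrequests_5no: writes done without md5 */
    };

    static void ex_md5_account (struct ex_md5_ctl *c, int used_md5, int md5_ok,
                                int on_thresh, int off_thresh)
    {
        if (used_md5 || md5_ok) {              /* an md5 exchange was attempted */
            if (used_md5 && md5_ok) {
                c->misses = 0;                 /* it works: keep it on */
            } else if (++c->misses > off_thresh) {
                c->misses = 0;
                c->enabled = 0;                /* not paying off: turn it off */
            }
        } else if (++c->plain > on_thresh) {
            c->plain = 0;
            c->enabled = 1;                    /* retry md5 every so often */
        }
    }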
+/*
+ * Take a request out of a slot. This must not hold the queue lock on
+ * entry as we take the queue lock in order to play with the device's
+ * queue.
+ *
+ * @slot the nbd slot on which to work
+ * @req the request
+ */
+static void
+nbd_rollback (struct nbd_slot *slot, struct request *req)
+{
+    struct nbd_device *lo = slot->lo;
+    long req_blks;
+    unsigned long flags;
+    int seqno;
+    struct list_head *pos;
+    struct request *xreq;
+
+    if (atomic_read (&lo->flags) & NBD_SHOW_ERRS) {
+        nbd_error (slot, req);
+        return;
+    }
+
+    req_blks = nr_blks (req);
+
+    if (req_blks < 0) {
+        NBD_ERROR ("passed illegal request %p\n", req);
+        return;
+    }
+
+    list_del_init (&req->queuelist);
+
+    NBD_ALERT ("rollback req %p from slot %d!\n", req, slot->i);
+
+    if (!(req->flags & REQ_SPECIAL)) {
+        /* PTB accounting */
+        slot->in -= req_blks;
+        slot->req -= req_blks;
+    }
+
+    seqno = rq_seqno (req);
+
+    write_lock_irqsave (&lo->queue_lock, flags);
+    list_for_each_prev (pos, &lo->queue) {
+        xreq = list_entry (pos, struct request, queuelist);
+        if (rq_seqno (xreq) > seqno) {
+            break;
+        }
+    }
+    list_add_tail (&req->queuelist, pos);
+    write_unlock_irqrestore (&lo->queue_lock, flags);
+}
+
+/*
+ * PTB - undo transactions by taking them out of the slot pending
+ *     - position and replacing them on the generic device queue
+ *     - NB we do not hold the io request lock or queue sem when
+ *     - calling this as we take it internally in nbd_rollback
+ *
+ * @slot the nbd slot to scan
+ */
+static void
+nbd_rollback_all (struct nbd_slot *slot)
+{
+    struct request *req;
+    short count = 0;
+
+    while (!list_empty (&slot->queue)) {
+
+        if (count++ > 1000)
+            break;
+
+        req = list_head (&slot->queue, struct request, queuelist);
+
+        if (!req)
+            break;
+
+        nbd_rollback (slot, req);
+    }
+}
+
+/*
+ * PTB error out all the requests on a slot
+ *
+ * We must be called without the io spinlock held, as we take it in
+ * nbd_error().
+ *
+ * @slot the nbd slot to scan
+ */
+static void
+nbd_error_all (struct nbd_slot *slot)
+{
+    struct request *req;
+    short count = 0;
+
+    while (!list_empty (&slot->queue)) {
+        if (count++ > 1000)
+            break;
+        req = list_head (&slot->queue, struct request, queuelist);
+        if (!req)
+            break;
+        nbd_error (slot, req);
+    }
+}
+
+/*
+ * PTB - let a request onto the slot pending position
+ *     - Can be called without the spinlock and doesn't take the
+ *       spinlock as we only deal with our unique slot. If there
+ *       were more than one client per slot this would be a problem
+ *       but there aren't so it isn't.
+ *
+ * @slot the nbd slot to let the request onto
+ * @req the request to move onto the slot queue
+ */
+void
+nbd_accept (struct nbd_slot *slot, struct request *req)
+{
+    struct nbd_device *lo = slot->lo;
+    long req_blks = nr_blks (req);
+    int cmd;
+
+    if (req_blks < 0)
+        return;
+
+    /* PTB accounting and nothing more */
+    cmd = rq_data_dir (req);
+
+    atomic_add (req_blks, &lo->requests_req[cmd]);
+    /* PTB - Note that this really is slot and not lo.
+     */
+    list_add (&req->queuelist, &slot->queue);
+
+    slot->req_age = jiffies;
+    slot->in += req_blks;
+    slot->req += req_blks;
+}
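nbd_rollback() puts a failed request back on the shared queue in sequence-number order by walking the list backwards from the tail. Reduced to just the ordered insertion (locking elided; this assumes, as nbd_enqueue()'s list_add() implies, that the newest request sits at the head):

    /* Reduced from nbd_rollback(): reinsert req so seqnos stay ordered,
     * newest (highest seqno) at the head of the queue. */
    static void ex_insert_ordered (struct list_head *queue, struct request *req)
    {
        struct list_head *pos;
        int seqno = rq_seqno (req);

        list_for_each_prev (pos, queue) {
            struct request *xreq = list_entry (pos, struct request, queuelist);
            if (rq_seqno (xreq) > seqno)
                break;    /* stop at the first strictly newer request */
        }
        list_add_tail (&req->queuelist, pos);   /* i.e. just behind it */
    }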
+/*
+ * PTB - read from userspace to a request buffer. Do it piecewise
+ *     - to cope with clustered requests.
+ *     - return number of bytes read
+ *
+ * Unfortunately the only way we can return less than the right
+ * number of bytes is when the receiving req does not have the
+ * right number of buffers, because the copy_from_user itself
+ * doesn't tell us.
+ */
+static int
+copy_from_user_to_req (struct request *req, char *user, int len)
+{
+    unsigned size = 0;
+    struct bio *bio /* = req->bio */;
+
+    /* PTB assume user verified */
+
+    rq_for_each_bio (bio, req) {
+
+        int i;
+        struct bio_vec *bvl;
+
+        bio_for_each_segment (bvl, bio, i) {
+
+            struct page *page = bvl->bv_page;
+            int offset = bvl->bv_offset;
+            const unsigned current_size = bvl->bv_len;
+            char *buffer;
+
+            buffer = page_address (page) + offset;
+
+            copy_from_user (buffer, user + size, current_size);
+
+            size += current_size;
+        }
+    }
+    if (size != len) {
+        NBD_ALERT ("requested %d and only read %d bytes to req %p\n",
+                   len, size, req);
+        NBD_ALERT ("request %p wanted to read user space buffer %p\n",
+                   req, user);
+    }
+    return size;
+}
+
+/*
+ * PTB - andres' kernel half of the user-space network handshake, used
+ *     - to complete a transaction.
+ *     - return 0 for success and -ve for fail.
+ *
+ * @slot the nbd slot being acted on
+ */
+int
+nbd_ack (struct nbd_slot *slot)
+{
+    struct nbd_reply reply;
+    struct request *req, *xreq;
+    int result = 0;
+
+    void *user;
+    long req_blks = 1;
+    struct nbd_device *lo = slot->lo;
+    unsigned buflen = 0;
+    unsigned reqlen;
+    int cmd;
+    struct list_head *pos;
+    int count = 0;
+
+    if (!(slot->flags & NBD_SLOT_BUFFERED)) {
+        return -EINVAL;
+    }
+
+    atomic_inc (&lo->cthreads);
+    slot->flags |= NBD_SLOT_RUNNING;
+    slot->cli_age = jiffies;
+
+    user = slot->buffer;
+    copy_from_user ((char *) &reply, (char *) user,
+                    sizeof (struct nbd_reply));
+
+    // PTB we keep tracking the write position in the input buffer
+    buflen += NBD_BUFFER_DATA_OFFSET;
+
+    // PTB save the reply handle (which is an address) as our req
+    memcpy (&req, &reply.handle, sizeof (req));
+
+    xreq = NULL;
+    list_for_each (pos, &slot->queue) {
+        xreq = list_entry (pos, struct request, queuelist);
+        if (count++ > 1000)
+            break;
+        if (xreq == req)
+            /* PTB found it */
+            break;
+    }
+
+    if (xreq != req) {
+
+        if (slot->nerrs++ < 3)
+            NBD_ALERT ("fatal: Bad handle %p != %p!\n", req, xreq);
+
+        atomic_dec (&lo->cthreads);
+        slot->flags &= ~NBD_SLOT_RUNNING;
+
+        NBD_ALERT ("ignoring ack of req %p which slot does not have\n", req);
+
+        /*
+         * PTB we lie and say success because userspace got through to
+         * us OK and the req they missed has been rolled back and will
+         * be retransmitted by the kernel later and elsewhere
+         */
+        return 0;
+    }
+
+    if (reply.magic != NBD_REPLY_MAGIC) {
+
+        if (slot->nerrs++ < 3)
+            NBD_ALERT ("Not enough reply magic in %s\n", __FUNCTION__);
+        /*
+         * PTB returning -EAGAIN causes the client to pause 0.5s
+         * and throw its reply away, then return to service. We leave
+         * any request we have to age and be rolled back.
+         */
+        return -EAGAIN;
+    }
+
+    if (reply.error > 0 || req->errors > 0) {
+        /* PTB wasn't error++'ed before */
+        req->errors++;
+        if (slot->nerrs++ < 3)
+            NBD_ALERT ("exited with reply error\n");
+        /* PTB we handle this - it's a remote error */
+        NBD_FAIL ("remote error on request\n");
+    }
+
+    req_blks = nr_blks (req);
+
+    reqlen = req->nr_sectors;
+    reqlen <<= 9;
+
+    cmd = rq_type (req);
+    switch (cmd) {
+
+        unsigned long rcmd;
+        char *arg;
+        int size;
+
+    case READ:
+
+        // PTB We have to copy the buffer bit by bit in
+        // case the request is clustered.
+        size = copy_from_user_to_req (req, ((char *) user) + buflen, reqlen);
+        if (size < reqlen) {
+            NBD_ALERT ("(%d): copy %dB from user to req %p failed (%d)\n",
+                       slot->i, reqlen, req, size);
+            // PTB we could try again? We should investigate.
+            NBD_FAIL ("exited because of bad copy from user\n");
+            // PTB FIXME - think we want to discard and retry
+        }
+
+        // PTB we keep tracking the write position in the buffer
+        buflen += size;
+        break;
+
+    case WRITE:
+        /*
+         * PTB we want to know if the reply is md5summed, and if it is
+         * whether the md5sum is the same as the one on the
+         * request. But that's not something we can presently see
+         * from here as we don't make an md5sum in the kernel.
+         * So we have to rely on the reply flag from userspace.
+         * We transmit the information to the slot, as we can't
+         * keep it on the request.
+         */
+        switch (reply.flags & (NBD_REPLY_MD5SUM | NBD_REPLY_MD5_OK)) {
+
+        case NBD_REPLY_MD5SUM | NBD_REPLY_MD5_OK:
+            /*
+             * PTB we asked for an md5sum comparison and
+             * the two matched, so we skipped writing the request
+             */
+            slot->flags |= (NBD_SLOT_MD5SUM | NBD_SLOT_MD5_OK);    // 11
+            break;
+        case NBD_REPLY_MD5SUM:
+            // PTB the two differed, so we wrote the request
+            slot->flags |= NBD_SLOT_MD5SUM;
+            slot->flags &= ~NBD_SLOT_MD5_OK;    // 10
+            break;
+        case NBD_REPLY_MD5_OK:
+            // PTB the server refused the md5 request
+            slot->flags &= ~NBD_SLOT_MD5SUM;
+            slot->flags |= NBD_SLOT_MD5_OK;     // 01
+            break;
+        default:
+        case 0:
+            // PTB nobody asked for an md5sum comparison
+            slot->flags &= ~(NBD_SLOT_MD5SUM | NBD_SLOT_MD5_OK);    // 00
+            break;
+        }
+        // PTB now we are all set up to do the accounting in commit etc.
+        break;
+
+    case SPECIAL:
+        // PTB FIXME. Just temporary.
+        NBD_ALERT ("special req %p on slot %d\n", req, slot->i);
+        req->errors = 0;
+        goto success;
+        break;
+
+    case IOCTL:
+
+        if (!(reply.flags & NBD_REPLY_IOCTL))
+            NBD_ALERT ("ioctl reply to req %p has no ioctl flag\n", req);
+
+        // PTB the commit should emit the request notification
+
+        rcmd = (long) req->special;
+        arg = req->buffer;
+
+        if (rcmd == -1l) {
+            result = -EINVAL;
+            NBD_FAIL ("unauthorized remote ioctl\n");
+        }
+
+        if (!(_IOC_DIR (rcmd) & _IOC_READ)) {
+            break;
+        }
+
+        /*
+         * PTB We saved ioctl size in req .. but only approximately,
+         * as nr_sectors.
+         */
+
+        /*
+         * PTB if we are reading, it should be to the local
+         * buffer arg, which points at lo->ctldata or other buffer
+         */
+
+        // PTB we are treating a saved local address or direct val
+        if (req->nr_sectors > 0) {
+            /*
+             * PTB sectors is an overestimate. Should be
+             * OK as we are reading from the client
+             * buffer which has plenty of room to spare.
+             */
+            int size = req->nr_sectors << 9;
+            copy_from_user (arg, (char *) user + buflen, size);
+            buflen += size;
+            break;
+        }
+
+        break;
+    }    // PTB eswitch
+    goto success;
+
+  success:
+    slot->nerrs = 0;
+    /*
+     * PTB - completion (or erroring) of transaction.
+     * note that nbd_commit will take a lock to do end_req
+     */
+    nbd_commit (slot, req);
+    atomic_dec (&lo->cthreads);
+    slot->flags &= ~NBD_SLOT_RUNNING;
+    return 0;
+
+  error_out:
+    /* PTB we will next do a client rollback on the slot from userspace.
+     * Right here we just skip the request.
+     * But .. don't error the request. We might have rolled it
+     * back and be referencing it.
+     */
+    if (result != -EAGAIN && result != 0) {
+        req->errors += req_blks;
+        slot->err += req_blks;
+    }
+    result = result < 0 ? result : -ENODEV;
+    // PTB one client thread leaves
+    atomic_dec (&lo->cthreads);
+    slot->flags &= ~NBD_SLOT_RUNNING;
+    return result;
+}
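The two halves above, nbd_get_req() below and nbd_ack(), are driven from the userspace client daemon through the shared slot buffer. A schematic of the daemon's main loop follows; the MY_NBD_GET_REQ/MY_NBD_ACK ioctl names come from the changelog above, but serve_over_socket() and the exact framing are hypothetical stand-ins, since the real protocol lives in the nbd-2.4.* userspace package:

    /* Schematic userspace client loop (shapes only; serve_over_socket()
     * is a hypothetical helper, not part of this patch). */
    for (;;) {
        /* 1. block in the kernel until a request is copied to the
         *    registered slot buffer */
        if (ioctl (nbd_fd, MY_NBD_GET_REQ, 0) < 0)
            continue;                /* timeout or rollback: just retry */

        /* 2. forward the request over TCP and collect the reply into
         *    the same buffer */
        if (serve_over_socket (sock_fd, buf) < 0)
            continue;                /* kernel will roll the slot back */

        /* 3. hand the reply back; the kernel ends the block request */
        ioctl (nbd_fd, MY_NBD_ACK, 0);
    }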
+/*
+ * PTB - write to userspace from a request buffer. Do it piecewise
+ *     - to cope with clustered requests.
+ *     - return number of bytes written
+ */
+static int
+copy_to_user_from_req (struct request *req, char *user, int len)
+{
+    unsigned size = 0;
+    struct bio *bio /* = req->bio */;
+
+    /* PTB assume user verified */
+
+    rq_for_each_bio (bio, req) {
+
+        int i;
+        struct bio_vec *bvl;
+
+        bio_for_each_segment (bvl, bio, i) {
+
+            struct page *page = bvl->bv_page;
+            int offset = bvl->bv_offset;
+            const unsigned current_size = bvl->bv_len;
+            char *buffer;
+
+            buffer = page_address (page) + offset;
+
+            copy_to_user (user + size, buffer, current_size);
+
+            size += current_size;
+        }
+    }
+    return size;
+}
+
+/*
+ * PTB do the device's three speed updates
+ *
+ * @lo the nbd device to do the update on
+ */
+static void
+nbd_set_speed (struct nbd_device *lo)
+{
+    int r, w, t;
+    struct nbd_speed *wspd = &lo->wspeed;
+    struct nbd_speed *rspd = &lo->rspeed;
+    struct nbd_speed *tspd = &lo->tspeed;
+
+    w = atomic_read (&lo->requests_in[WRITE]);
+    wspd->update (wspd, w);
+    r = atomic_read (&lo->requests_in[READ]);
+    rspd->update (rspd, r);
+    t = w + r;
+    tspd->update (tspd, t);
+}
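nbd_set_speed() feeds the cumulative request counters into three struct nbd_speed trackers through an update() hook. That struct's internals are not shown in this hunk, so the following is only a plausible stand-in that derives a blocks-per-second rate from a cumulative counter; the field names and jiffies arithmetic are assumptions, not the real nbd_speed:

    /* Hypothetical stand-in for struct nbd_speed's update() hook. */
    struct ex_speed {
        unsigned long frstj;    /* jiffies at first sample */
        unsigned long lastj;    /* jiffies at last sample */
        int total;              /* last cumulative count seen */
        int persec;             /* derived rate, blocks/sec */
    };

    static void ex_speed_update (struct ex_speed *spd, int cumulative)
    {
        unsigned long now = jiffies;

        if (now > spd->lastj && cumulative >= spd->total)
            spd->persec = (cumulative - spd->total) * HZ
                        / (unsigned) (now - spd->lastj);
        spd->total = cumulative;
        spd->lastj = now;
    }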
+/*
+ * PTB - andres' kernel half of the userspace networking. This part
+ *     - initiates the transaction by taking a request off the generic
+ *     - device queue and placing it in the slot's pending position.
+ *     - I believe we return 0 for success and -ve for fail.
+ *     - timeo is the number of jiffies we are prepared to wait
+ *
+ * @slot the nbd slot to act on.
+ */
+int
+nbd_get_req (struct nbd_slot *slot)
+{
+    struct nbd_request request;
+    struct request *req;
+    int result = 0;
+    static atomic_t count;
+    unsigned start_time = jiffies;
+    struct nbd_device *lo = slot->lo;
+    unsigned timeout = lo->req_timeo * HZ;
+    int islot = slot->i;
+    // PTB for the new timezone field in requests
+    extern struct timezone sys_tz;
+    struct timeval time;
+    unsigned long flags;
+    struct nbd_seqno *seqno_out = &lo->seqno_out;
+
+    atomic_inc (&lo->cthreads);    // PTB - client thread enters
+    slot->flags |= NBD_SLOT_RUNNING;
+    slot->cli_age = jiffies;
+
+    if (!(slot->flags & NBD_SLOT_BUFFERED)) {
+        NBD_FAIL ("Our slot has no buffer");
+    }
+
+    atomic_set (&lo->islot, islot);
+
+    if (!list_empty (&slot->queue)) {
+        NBD_FAIL ("impossible! already treating one request");
+        // PTB we do a nontrivial rollback from the user daemon
+    }
+    if (!slot->file) {
+        result = -EBADF;
+        NBD_FAIL ("Our slot has been nofiled");
+    }
+    if (!(atomic_read (&lo->flags) & NBD_ENABLED)) {
+        result = -ENODEV;
+        NBD_FAIL ("Our slot has been vamooshed");
+    }
+
+    atomic_inc (&lo->cwaiters);
+    slot->flags |= NBD_SLOT_WAITING;
+
+    // PTB take spinlock in order to examine queue
+    // we need to protect ourselves against the request fn too
+    read_lock_irqsave (&lo->queue_lock, flags);
+    atomic_dec (&lo->cwaiters);
+    slot->flags &= ~NBD_SLOT_WAITING;
+
+    // PTB - now spin until request arrives to treat
+    while (slot->file && list_empty (&lo->queue)) {
+
+        static int nbd_clr_sock (struct nbd_slot *slot);    // forward decl
+        int siz;
+        int time_left = start_time + timeout - jiffies;
+
+        read_unlock_irqrestore (&lo->queue_lock, flags);
+
+        // PTB one client thread goes to sleep
+        atomic_inc (&lo->cwaiters);
+        slot->flags |= NBD_SLOT_WAITING;
+
+        interruptible_sleep_on_timeout (&lo->wq, time_left);
+
+        slot->flags &= ~NBD_SLOT_WAITING;
+        // PTB one client thread reactivates
+        atomic_dec (&lo->cwaiters);
+        atomic_inc (&count);
+
+        // PTB Have to take the spinlock again to check at the queue
+        atomic_inc (&lo->cwaiters);
+        slot->flags |= NBD_SLOT_WAITING;
+        // we need to protect ourselves against the request fn too
+        read_lock_irqsave (&lo->queue_lock, flags);
+        atomic_dec (&lo->cwaiters);
+        slot->flags &= ~NBD_SLOT_WAITING;
+
+        // PTB fail for recheck if we are inactive too long
+
+        time_left = start_time + timeout - jiffies;
+        if (time_left > 0 || !list_empty (&lo->queue))
+            continue;
+
+        // PTB bad. timeout with nothing on queue. Error out.
+        result = -ETIME;
+
+        // PTB we will exit with fail, so up spinlock now
+        read_unlock_irqrestore (&lo->queue_lock, flags);
+
+        siz = lo->blksize + sizeof (struct nbd_request);
+        // PTB verify the buffer is still OK - holds one block
+        if (access_ok (VERIFY_WRITE, slot->buffer, siz))
+            goto error_out;
+
+        // PTB buffer is invalid
+        result = -EINVAL;
+
+        // PTB clr_sock takes both the io lock and the spinlock
+        nbd_clr_sock (slot);
+        NBD_FAIL ("Our process has died or lost its buffer");
+
+        /*
+         * PTB we may do a rollback from the user daemon here
+         * but it'll be trivial - without effect - as we don't
+         * have a request in our slot to treat.
+         */
+        goto error_out;
+
+    }    // end while loop
+
+    // PTB we still have the (read) spinlock here
+
+    if (!(atomic_read (&lo->flags) & NBD_ENABLED)) {
+        read_unlock_irqrestore (&lo->queue_lock, flags);
+        result = -ENODEV;
+        NBD_FAIL ("Our slot vaporized while we slept!");
+    }
+    if (!slot->file) {
+        read_unlock_irqrestore (&lo->queue_lock, flags);
+        result = -EBADF;
+        NBD_FAIL ("Our slot nofiled itself while we slept!");
+    }
+    if (!list_empty (&slot->queue)) {
+        read_unlock_irqrestore (&lo->queue_lock, flags);
+        result = -EINVAL;
+        NBD_FAIL ("impossible! already treating one request");
+        // PTB we do a nontrivial rollback from the user daemon
+    }
+
+    // PTB now relinquish the read lock and try for the write lock
+    read_unlock_irqrestore (&lo->queue_lock, flags);
+
+    write_lock_irqsave (&lo->queue_lock, flags);
+    // PTB got the write lock
+
+    if (list_empty (&lo->queue)) {
+        write_unlock_irqrestore (&lo->queue_lock, flags);
+        // PTB - somebody else did it while we waited on spinlock. OK
+        result = -EINVAL;
+        NBD_FAIL ("ho hum beaten to the punch");
+        // PTB we may do a trivial rollback from the user daemon
+    }
+    // PTB cli/sti here looks unnec. hardware interrupts return here
+    // AMARIN begin uninterruptible code
+
+    // PTB we have the (write) spinlock
+
+    // PTB oldest = last element in queue
+    req = list_tail (&lo->queue, struct request, queuelist);
+
+    // PTB this is where we free the req from our queue. We need to be
+    // holding our spinlock at this point
+
+    // PTB - must succeed as have the spinlock
+    result = nbd_remove (lo, req);
+    // PTB now holding irqs off in nbd_remove
+
+    // AMARIN end uninterruptible code
+    // PTB uh - maybe cli/sti is needed? interrupts can muck the queue?
+    //     - Nah! I have left them enabled so we can see any errors.
+
+    write_unlock_irqrestore (&lo->queue_lock, flags);
+
+    request.magic = NBD_REQUEST_MAGIC;
+    request.flags = 0;
+
+    switch (rq_type (req)) {
+
+        unsigned long cmd;
+        char *arg;
+        size_t size;
+
+    case IOCTL:
+
+        request.type = IOCTL;
+
+        // PTB this is our special ioctl kernel request
+        cmd = (unsigned long) req->special;
+        arg = req->buffer;
+        size = req->nr_sectors << 9;
+
+        // PTB the arg was a literal
+        request.len = 0;
+        // PTB we are in get_req, transferring stored ioctl
+        if ((_IOC_DIR (cmd) & _IOC_READ) && size > 0) {
+            // PTB if len is +ve we copy to the user buffer later
+            request.len = size;
+        }
+        // PTB we store the weirded ioctl id.
+        // PTB Yes, this composition is our private invention.
+        request.from = (((__u64) cmd) << 32)
+            // PTB really want this to go to a 64 bit request.special
+            | ((__u64) (unsigned long) arg);
+        break;
+
+    case READ:
+    case WRITE:
+
+        request.type = rq_data_dir (req);
+        request.from = req->sector;
+        request.from <<= 9;
+        request.len = req->nr_sectors;
+        request.len <<= 9;
+        if (atomic_read (&lo->flags) & NBD_MD5SUM) {
+            // PTB set the please do md5sum flag on the request
+            request.flags |= NBD_REQUEST_MD5SUM;
+        }
+        break;
+
+    case MD5SUM:
+        break;
+
+    case SPECIAL:
+        request.type = SPECIAL;
+        request.len = req->nr_sectors;
+        request.len <<= 9;
+        request.from = req->sector;
+        request.from <<= 9;
+        if (rq_data_dir (req) == WRITE)
+            request.flags |= NBD_REQUEST_SPECIALRW;
+        request.special = (typeof (request.special)) req->special;
+        break;
+
+    default:
+        NBD_ALERT ("received unknown req %p type %#x\n",
+                   req, rq_type (req));
+        break;
+    }
+
+    request.seqno = seqno_out->calc (seqno_out, rq_seqno (req));
+
+    /*
+     * PTB we should here erase the extra seqno info in the request
+     * so that on error or on ack the kernel can use the right internal
+     * array, but I'll erase it in the ack function instead
+     */
+
+    do_gettimeofday (&time);
+    request.time = time.tv_sec;
+    request.time *= 1000000;
+    request.time += time.tv_usec;
+    request.zone = sys_tz.tz_minuteswest;
+
+    // PTB tz_dsttime = 0 always in linux
+
+    memcpy (&request.handle, &req, sizeof (request.handle));
+
+    copy_to_user (slot->buffer, (char *) &request, sizeof (request));
+
+    switch (request.type) {
+
+        int err;
+        char *arg;
+
+    case READ:
+        break;
+
+    case IOCTL:
+        if (request.len <= 0)
+            break;    // PTB presumably nothing to do
+        arg = (char *) slot->buffer + NBD_BUFFER_DATA_OFFSET;
+        copy_to_user (arg, req->buffer, request.len);
+        break;
+
+    case WRITE:
+        arg = (char *) slot->buffer + NBD_BUFFER_DATA_OFFSET;
+        err = copy_to_user_from_req (req, arg, request.len);
+        if (err >= request.len)
+            break;    // OK
+        // PTB buffer had missing BHs
+        NBD_ERROR ("req %p offered %d bytes of %d for copy to user\n",
+                   req, err, request.len);
+        // PTB this request is badly damaged. We had better shoot it.
+        if (req && req->errors == 0) {
+            req->errors++;
+            nbd_end_request_lock (req);
+            blk_put_request (req);
+        }
+        NBD_FAIL ("kernel failed to keep req while we copied from it");
+        break;
+
+    case MD5SUM:
+        break;
+
+    case SPECIAL:
+        // PTB temporary. We do not treat specials at the moment.
+        req->errors = 0;
+        break;
+
+    default:
+        NBD_ERROR ("req %p was type %#x\n", req, rq_type (req));
+        NBD_FAIL ("unknown req type");
+        break;
+    }
+
+    /*
+     * PTB nbd_accept does not take spinlock and does not need to as
+     * the req is already free of the shared queue and only needs
+     * to be placed on the unique slot queue.
+     */
+    nbd_accept (slot, req);
+
+    atomic_dec (&lo->cthreads);    // PTB - client thread leaves normally
+    slot->flags &= ~NBD_SLOT_RUNNING;
+
+    return 0;
+
+  error_out:
+    // PTB accounting - a fail to get a request is not an errored request
+    atomic_dec (&lo->cthreads);    // PTB - client thread leaves abnormally
+    slot->flags &= ~NBD_SLOT_RUNNING;
+    result = result < 0 ? result : -ENODEV;
+
+    return result;
+}
+
+/*
+ * PTB error out the pending requests on the kernel queue
+ * We have to be called WITHOUT the io request lock held.
+ * We sleep in between clearing each request, for "safety".
+ *
+ * @lo the nbd device to scan
+ */
+static int
+nbd_clr_kernel_queue (struct nbd_device *lo)
+{
+    int count = 0;
+    unsigned long flags;
+    request_queue_t *q = lo->q;
+
+    spin_lock_irqsave (q->queue_lock, flags);
+
+    while (!blk_queue_empty (q) && count++ < 1000) {
+        struct request *req;
+
+        req = elv_next_request (q);
+        if (!req) {    // PTB impossible
+            spin_unlock_irqrestore (q->queue_lock, flags);
+            NBD_ALERT ("impossible! kernel queue empty after tested nonempty!\n");
+            goto fail;
+        }
+        blkdev_dequeue_request (req);
+        spin_unlock_irqrestore (q->queue_lock, flags);
+        if (!req->errors)
+            req->errors++;
+        schedule ();
+        nbd_end_request_lock (req);
+        blk_put_request (req);
+        spin_lock_irqsave (q->queue_lock, flags);
+    }
+    spin_unlock_irqrestore (q->queue_lock, flags);
+    goto success;
+
+  fail:
+    /* PTB fall thru */
+  success:
+    NBD_ALERT ("removed %d requests\n", count);
+    return count;
+}
+
+/*
+ * PTB error out the pending requests on the nbd queue and kernel queue
+ * Note that we take the queue spinlock for this
+ *
+ * @lo the nbd device to scan
+ */
+static int
+nbd_clr_queue (struct nbd_device *lo)
+{
+    int count = 0;
+
+    while (count < 1000) {
+
+        struct request *req;
+        long req_blks = 1;
+
+        // PTB cannot allow new requests via interrupts
+        write_lock (&lo->queue_lock);
+        if (list_empty (&lo->queue)) {
+            write_unlock (&lo->queue_lock);
+            break;
+        }
+        req = list_head (&lo->queue, struct request, queuelist);
+        if (!req) {
+            write_unlock (&lo->queue_lock);
+            break;
+        }
+
+        req_blks = nr_blks (req);
+
+        req->errors += req_blks + 1;
+        atomic_add (req_blks, &lo->requests_err);
+
+        /* PTB - must succeed as have the spinlock */
+        nbd_remove (lo, req);
+        /* PTB now hold irqs off in nbd_remove */
+        write_unlock (&lo->queue_lock);
+        count++;
+
+        nbd_end_request_lock (req);
+        blk_put_request (req);
+    }
+    NBD_ALERT ("unqueued %d reqs\n", count);
+    return count;
+}
+/*
+ * PTB do under alt spinlock - we take the lo queue_lock ourselves.
+ * We take all requests off the alt queue to which they've been
+ * diverted and put them on the device's normal queue, where they will
+ * then be treated in the normal course of events. They were diverted
+ * to the alt queue after we received a SPECIAL, and they're being
+ * released now that we've treated all the extant reqs.
+ *
+ * @lo the nbd device being treated
+ */
+static int
+nbd_requeue (struct nbd_device *lo)
+{
+    int count = 0;
+
+    while (count < 1000) {
+
+        struct request *req;
+
+        // PTB cannot allow new requests via interrupts
+        if (list_empty (&lo->altqueue)) {
+            break;
+        }
+        req = list_tail (&lo->altqueue, struct request, queuelist);
+        if (!req)
+            break;
+
+        // PTB heisenbug? without these list_del oopses on null deref
+        if (req->queuelist.prev == NULL) {
+            NBD_ALERT ("req %p has 0 prev ptr! Abort\n", req);
+            break;
+        }
+        if (req->queuelist.next == NULL) {
+            NBD_ALERT ("req %p has 0 next ptr! Abort\n", req);
+            break;
+        }
+        /* PTB - must succeed as have the spinlock */
+        list_del_init (&req->queuelist);
+        /* PTB now hold irqs off in nbd_remove */
+        count++;
+
+        nbd_enqueue (lo, req);
+    }
+    return count;
+}
+
+#undef NBD_FAIL
+#define NBD_FAIL( s... ) { \
+    NBD_ERROR( s ); printk("\n"); \
+    goto error_out; \
+}
+
+#ifndef NO_BUFFERED_WRITES
+/*
+ * Magic function from rd.c that we hope saves a buffer head
+ * permanently somewhere in the kernel VM system.
+ */
+static int
+buffered_write_pagecache_IO (struct buffer_head *sbh, int nbd)
+{
+    struct address_space *mapping;
+    unsigned long index;
+    int offset, size, err;
+    struct nbd_device *lo = &nbd_dev[nbd];
+
+    err = 0;
+
+    // PTB we need to save the /dev/nda inode
+    if (!lo->inode) {
+        err = -ENODEV;
+        goto out;
+    }
+    mapping = lo->inode->i_mapping;
+
+    // PTB index appears to be the page number
+    index = sbh->b_rsector >> (PAGE_CACHE_SHIFT - 9);
+    // PTB offset is in bytes, and says where in the page the sector starts
+    offset = (sbh->b_rsector << 9) & ~PAGE_CACHE_MASK;
+    // PTB well, an abbreviation for the buffer size, in bytes
+    size = sbh->b_size;
+
+    do {
+        // PTB we mark each page that we should write to Uptodate
+
+        int count;
+        struct page **hash;
+        struct page *page;
+        char *src, *dst;
+
+        int unlock = 0;
+
+        // PTB ummm, how much of the page is left to traverse
+        count = PAGE_CACHE_SIZE - offset;
+        // PTB reduce it to how much we actually need to traverse
+        if (count > size)
+            count = size;
+        // PTB say NOW that we have traversed what we want of the page
+        size -= count;
+
+        hash = page_hash (mapping, index);
+        page = __find_get_page (mapping, index, hash);
+
+        if (!page) {
+            // PTB we get to make a new page
+            page = grab_cache_page (mapping, index);
+            if (!page) {
+                // PTB failed to get new page
+                err = -ENOMEM;
+                goto out;
+            }
+            // PTB magic
+            if (!Page_Uptodate (page)) {
+                memset (kmap (page), 0, PAGE_CACHE_SIZE);
+                kunmap (page);
+                SetPageUptodate (page);
+            }
+            // PTB the new page is locked. We need to unlock it later
+            unlock = 1;
+        }
+
+        // PTB prepare already for next page
+        index++;
+
+        // PTB set up for copy
+        dst = kmap (page);
+        dst += offset;
+        src = bh_kmap (sbh);
+
+        // PTB prepare for next round
+        offset = 0;
+
+        // PTB do a copy
+        memcpy (dst, src, count);
+
+        kunmap (page);
+        bh_kunmap (sbh);
+
+        if (unlock) {
+            UnlockPage (page);
+        }
+        SetPageDirty (page);
+        __free_page (page);
+
+    } while (size > 0);
+
+  out:
+    return err;
+}
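buffered_write_pagecache_IO() locates each sector inside the page cache with two shifts: the page index is the sector number divided by sectors-per-page, and the byte offset is the remainder. A worked standalone version of that arithmetic for 4KB pages (PAGE_CACHE_SHIFT == 12, so 8 sectors of 512B per page; the EX_* names are illustrative):

    /* Worked example of the index/offset split above, for 4KB pages. */
    #include <stdio.h>

    #define EX_PAGE_SHIFT 12                 /* 4096-byte pages */
    #define EX_PAGE_MASK  (~((1UL << EX_PAGE_SHIFT) - 1))

    int main (void)
    {
        unsigned long sector = 21;           /* 512-byte sectors */
        unsigned long index  = sector >> (EX_PAGE_SHIFT - 9);
        unsigned long offset = (sector << 9) & ~EX_PAGE_MASK;

        /* sector 21 -> page 2, byte offset 2560 within the page */
        printf ("page %lu, offset %lu\n", index, offset);
        return 0;
    }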
+static int
+buffered_write (struct request *req)
+{
+    struct buffer_head *bh;
+    int dev = minor (req->rq_dev);
+    int nbd = dev >> NBD_SHIFT;
+    int err = 0;
+
+    // PTB go through and copy and protect the written buffers
+    for (bh = req->bh; bh; bh = bh->b_reqnext) {
+        struct buffer_head *rbh;
+
+        rbh = getblk (bh->b_rdev, bh->b_rsector / (bh->b_size >> 9),
+                      bh->b_size);
+        if (bh != rbh) {
+            char *bdata = bh_kmap (bh);
+            memcpy (rbh->b_data, bdata, rbh->b_size);
+            NBD_ALERT ("got new bh sector %lu on write\n", bh->b_rsector);
+        }
+        bh_kunmap (bh);
+        mark_buffer_protected (rbh);    // PTB equals dirty, uptodate
+        err = buffered_write_pagecache_IO (bh, nbd);
+        if (err < 0) {
+            break;
+        }
+        brelse (rbh);
+    }
+    return err;
+}
+
+#endif /* NO_BUFFERED_WRITES */
+
+/*
+ * PTB check if the device is read only according to int flags
+ *
+ * @lo the nbd device to be checked
+ */
+static int
+nbd_read_only (struct nbd_device *lo)
+{
+    return (atomic_read (&lo->flags) & NBD_READ_ONLY) != 0;
+}
+
+/*
+ * PTB set the device readonly (or not)
+ *
+ * @lo the nbd device to be set up
+ * @ro 1 for read only, 0 for read write.
+ */
+static void
+nbd_set_read_only (struct nbd_device *lo, int ro)
+{
+    if (ro != 0) {
+        atomic_set_mask (NBD_READ_ONLY, &lo->flags);
+    } else {
+        atomic_clear_mask (NBD_READ_ONLY, &lo->flags);
+    }
+
+    // PTB which device really does not matter. We do the checking.
+    set_disk_ro (lo->disk, ro != 0);
+}
+
+/*
+ * PTB - kernel function to take reqs off the kernel queue. Runs with
+ * io lock held. This is the "request function".
+ */
+static void
+do_nbd_request (request_queue_t * q)
+{
+    struct request *req;
+    unsigned long flags;
blk_queue_empty(q)) { + + struct nbd_device *lo; + + req = elv_next_request(q); + + lo = req->rq_disk->private_data; + + /* PTB - one kernel thread enters */ + atomic_inc (&lo->kthreads); + + if (atomic_read (&lo->kthreads) > atomic_read (&lo->kmax)) + atomic_set (&lo->kmax, atomic_read (&lo->kthreads)); + + if (!lo->inode || !lo->file) { + NBD_FAIL ("Request when device not ready."); + } + + if (rq_data_dir (req) == WRITE && nbd_read_only(lo)) { + NBD_FAIL ("write on read-only device"); + } + flags = atomic_read (&lo->flags); + if (!(flags & NBD_INITIALISED)) { + NBD_FAIL ("device not initialised."); + } + if (!(flags & NBD_ENABLED)) { + NBD_FAIL ("device not enabled."); + } + if (flags & NBD_REMOTE_INVALID) { + NBD_FAIL ("remote device invalidated."); + } + if (req->sector + req->nr_sectors > lo->sectors) { + NBD_FAIL ("overrange request"); + } + if (req->sector < 0) { + NBD_FAIL ("underrange request"); + } + if (req->rq_disk->major != major) { + NBD_FAIL ("request for wrong major"); + } + req->errors = 0; + blkdev_dequeue_request (req); + + // PTB in 2.5 we can release the iolock briefly here + spin_unlock_irq(q->queue_lock); + + if (req->flags & REQ_SPECIAL) { + // PTB temporary successful end here for SPECIALS + + // PTB we want to attach it to the device and ack later + nbd_enqueue (lo, req); + // PTB block further reqs until these have drained + write_lock(&lo->altqueue_lock); + // PTB do not touch this flag without this lock + if (atomic_read(&lo->countq[READ]) + + atomic_read(&lo->countq[WRITE]) > 0) { + atomic_set_mask(NBD_QBLOCKED, &lo->flags); + } + write_unlock(&lo->altqueue_lock); + goto accounting; + } + + // PTB we are the only reader and writer of lo->seqno + if (rq_data_dir (req) == WRITE && rq_seqno (req) == 0) { + // PTB it is a new request never seen before + struct nbd_seqno * seqno_out = &lo->seqno_out; + seqno_out->inc(seqno_out); + /* + * PTB we have to be careful to change this back before + * giving it back to the kernel, as the kernel uses it. + * We patch it back again in nbd_end_request. + */ + rq_set_seqno (req, seqno_out->get(seqno_out)); + } + + // if BLOCK is set divert requests to alt queue + write_lock(&lo->altqueue_lock); + if (atomic_read(&lo->flags) & NBD_QBLOCKED) { + list_add (&req->queuelist, &lo->altqueue); + write_unlock(&lo->altqueue_lock); + goto accounting; + } + write_unlock(&lo->altqueue_lock); + + // PTB normal sequence is to queue request locally + nbd_enqueue (lo, req); + goto accounting; + + accounting: + atomic_dec (&lo->kthreads); + // PTB regain the iolock for another turn + spin_lock_irq(q->queue_lock); + continue; // PTB next request + + error_out: + // PTB can rely on req being nonnull here + NBD_ALERT ("ending req %p with prejudice\n", req); + req->errors++; + blkdev_dequeue_request (req); + spin_unlock_irq(q->queue_lock); + + nbd_end_request_lock (req); + blk_put_request (req); + + // PTB more accounting + if (lo) { + int req_blks = nr_blks (req); + atomic_add (req_blks, &lo->requests_err); + atomic_dec (&lo->kthreads); + } else { + NBD_ALERT("failed to account one orphan errored req\n"); + } + // PTB regain the queue lock for another turn + spin_lock_irq(q->queue_lock); + continue; + } + return; +} + +/* + * PTB pair of helpful additional functions, only good for 1 bit in the + * mask, however. Modify if you want more. 
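+ *
+ * For example (a sketch only), nbd_enable below relies on the
+ * test-and-set variant to run its once-only reenable code:
+ *
+ *	if (!atomic_test_and_set_mask (&lo->flags, NBD_ENABLED)) {
+ *		// bit was clear before: we are the ones who set it
+ *	}
+ *
+ * ffs(mask) locates the single set bit, so a mask with more than
+ * one bit set would test and set only the lowest of them.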
+ * + * @a the atomic element's address + * @mask the integer with one bit set in the position that we want to test + * and set, or clear + */ +static int +atomic_test_and_set_mask (atomic_t * a, unsigned mask) +{ + int i = ffs (mask); + if (!i) + return -EINVAL; + // PTB gahhhh ... + #ifdef __LITTLE_ENDIAN + return test_and_set_bit (i - 1, (unsigned long *)&a->counter); + #else + #ifndef __BIGENDIAN + #error help, I only know about bigendian or littlendian machines + #endif + return test_and_set_bit + (i - 1 + (sizeof(long)-sizeof(a->counter))*8, + (unsigned long *)&a->counter); + #endif +} +static int +atomic_test_and_clear_mask (atomic_t * a, unsigned mask) +{ + int i = ffs (mask); + if (!i) + return 0; + // PTB gahhhh ... + #ifdef __LITTLE_ENDIAN + return test_and_clear_bit (i - 1, (unsigned long *)&a->counter); + #else + #ifndef __BIGENDIAN + #error help, I only know about bigendian or littlendian machines + #endif + return test_and_clear_bit + (i - 1 + (sizeof(long)-sizeof(a->counter))*8, + (unsigned long *)&a->counter); + #endif +} + + +/* + * PTB - set the enabled flag on a device (call without the spinlock held) + * + * @lo the nbd device being treated + */ +static void +nbd_enable (struct nbd_device *lo) { + unsigned long flags; + int did_enabled = 0; + struct nbd_md *md = &nbd_md; + + // PTB reenable part + write_lock_irqsave (&lo->meta_lock, flags); + if (!atomic_test_and_set_mask (&lo->flags, NBD_ENABLED)) { + // PTB was not enabled before + atomic_clear_mask (NBD_VALIDATED, &lo->flags); + lo->lives++; + did_enabled = 1; + } + write_unlock_irqrestore (&lo->meta_lock, flags); + + if (did_enabled) + md->notify(&nbd_md, mk_kdev (major, lo->nbd << NBD_SHIFT)); +} + + +/* + * PTB rollback all requests on a given slot and then invalidate it + * (so the requests can't go back until somebody reactivates the slot) + * At least rollback (which we call takes both the io spinlock and our + * spinlock, so we can hold neither when we are called. Soft_reset + * (which we call) also calls rollback, so has the same problem. + * + * @slot the nbd slot being treated + */ +static int +nbd_clr_sock (struct nbd_slot *slot) +{ + int i = 0; + struct nbd_device *lo = slot->lo; + int islot = slot->i; + unsigned long flags; + int do_reset = 0; + int do_enable = 0; + static int nbd_soft_reset (struct nbd_device*); + + nbd_rollback_all (slot); + + slot->file = NULL; + slot->bufsiz = 0; + slot->flags = 0; + slot->buffer = NULL; + + write_lock_irqsave (&lo->meta_lock, flags); + + /* PTB reset lo->aslot */ + + if (lo->aslot > 0) { + + /* PTB grr .. 
do this the hard way */
+		int aslot = 0;
+		for (i = 0; i < lo->nslot; i++) {
+			struct nbd_slot *sloti = &lo->slots[i];
+			if (sloti->file)
+				aslot++;
+		}
+		lo->aslot = aslot;
+
+		if (lo->aslot <= 0) {
+			// PTB we were the last client alive, disable device
+			if (atomic_read (&lo->flags) & NBD_SHOW_ERRS) {
+				// PTB soft_reset will invalidate_buffers
+				atomic_clear_mask (NBD_ENABLED, &lo->flags);
+				do_reset = 1;
+			}
+		} else if (!(atomic_read (&lo->flags) & NBD_ENABLED)) {
+			// PTB must not call reenable as that clears the queue
+			do_enable = 1;
+		}
+
+	}
+
+	// PTB lift the lock temporarily
+	write_unlock_irqrestore(&lo->meta_lock, flags);
+	if (do_reset) {
+		nbd_soft_reset (lo);
+	}
+	if (do_enable) {
+		nbd_enable (lo);
+		NBD_ALERT ("enabled device nd%s\n", lo->devnam);
+	}
+	write_lock_irqsave(&lo->meta_lock, flags);
+
+	/* PTB reset lo->islot, for no good reason */
+
+	if (atomic_read (&lo->islot) == islot) {
+		for (i = 0; i++ < lo->nslot;) {
+			atomic_inc (&lo->islot);
+			if (atomic_read (&lo->islot) >= lo->nslot)
+				atomic_set (&lo->islot, 0);
+			if (lo->slots[atomic_read (&lo->islot)].file)
+				break;
+		}
+	}
+	lo->harderror = 0;
+	write_unlock_irqrestore (&lo->meta_lock, flags);
+
+	/* PTB don't clear whole device queue as we might still be open */
+
+	return 0;
+}
+
+/*
+ * PTB - check all slots for old requests and roll them back.
+ * At least rollback (which we call) takes both the io spinlock and our
+ * spinlock, so we can hold neither when we are called.
+ *
+ * @lo the nbd device to scan
+ */
+static void
+nbd_rollback_old (struct nbd_device *lo)
+{
+
+	int islot;
+
+	for (islot = 0; islot < lo->nslot; islot++) {
+		struct nbd_slot *slot = &lo->slots[islot];
+		if (slot->req_age > 0
+		    && slot->req_age < jiffies - lo->req_timeo * HZ) {
+			nbd_rollback_all (slot);
+		}
+	}
+
+}
+
+/*
+ * PTB - register a socket to a slot.
+ * - Return 0 for success and -ve for failure.
+ * Nowadays this doesn't do very much! Just finalizes things.
+ *
+ * @slot the nbd slot being registered
+ */
+static int
+nbd_set_sock (struct nbd_slot *slot, int arg)
+{
+
+	struct nbd_device *lo = slot->lo;
+	int islot = slot->i;
+	unsigned long flags;
+	int do_enable = 0;
+
+	if (!(atomic_read (&lo->flags) & NBD_INITIALISED)) {
+		NBD_ALERT ("(%d) device nd%s not initialised yet!\n",
+			   islot, lo->devnam);
+		return -ENODEV;
+	}
+	if (!(atomic_read (&lo->flags) & NBD_SIZED)) {
+		NBD_ALERT ("(%d) device nd%s not sized yet!\n", islot,
+			   lo->devnam);
+		return -EINVAL;
+	}
+	if (!(atomic_read (&lo->flags) & NBD_BLKSIZED)) {
+		NBD_ALERT ("(%d) device nd%s not blksized yet!\n", islot,
+			   lo->devnam);
+		return -EINVAL;
+	}
+	if (!(atomic_read (&lo->flags) & NBD_SIGNED)) {
+		NBD_ALERT ("(%d) setting unsigned device nd%s! But harmless.\n",
+			   islot, lo->devnam);
+		return -EINVAL;
+	}
+
+	down (&lo->pid_sem);
+
+	if (slot->pid != current->pid) {
+		if (jiffies > slot->cli_age + 2 * HZ * lo->req_timeo) {
+			NBD_ALERT
+			    ("(%d) dead client process %d has nd%s%d, erasing pid!\n",
+			     islot, slot->pid, lo->devnam, islot + 1);
+			slot->pid = 0;
+		} else {
+			NBD_ALERT
+			    ("(%d) other live client process %d has nd%s%d!\n",
+			     islot, slot->pid, lo->devnam, islot + 1);
+		}
+		up (&lo->pid_sem);
+		return -EINVAL;
+	}
+	up (&lo->pid_sem);
+
+	slot = &lo->slots[islot];
+
+	// PTB this is a queue critical code region for the flags business
+	write_lock_irqsave (&lo->meta_lock, flags);
+
+	// PTB file has to be nonzero to indicate we are all set up.
+	slot->file = (void *) (unsigned long) (arg+1 > 0 ?
arg+1 : 1); + + if (islot >= lo->nslot) { + lo->nslot = islot + 1; + NBD_INFO ("increased socket count to %d\n", lo->nslot); + } + + lo->harderror = 0; + + if (lo->disk && !get_capacity(lo->disk)) { + set_capacity(lo->disk, lo->sectors); + } + if (++lo->aslot > 0) { + do_enable = 1; + } + // PTB end of queue critical region + write_unlock_irqrestore (&lo->meta_lock, flags); + + /* + * PTB if this is the first slot, we might call reenable and + * thus clr queue too, but reenable takes the spinlock + */ + if (do_enable) + nbd_enable(lo); + + return 0; +} + +/* + * PTB - return the index i of 2^i + j, 0 <= j < 2^i + */ +static inline unsigned +log2 (unsigned arg) +{ + unsigned log = 0; + while ((arg >>= 1) > 0) + log++; + return log; +} + +/* + * PTB - set the blksize in bytes of the block device. Return 0 for + * - success and -ve for failure. + */ +static int +nbd_set_blksize (struct nbd_device *lo, unsigned int arg) +{ + int nbd = lo->nbd; + if (arg > PAGE_SIZE || arg < 512 || (arg & (arg - 1))) { + NBD_ERROR ("blksize too big (%u)\n", arg); + return -EINVAL; + } + lo->blksize = nbd_blksizes[nbd << NBD_SHIFT] = arg; + lo->logblksize = log2 (lo->blksize); + set_blocksize(lo->inode->i_bdev, lo->blksize); + atomic_set_mask (NBD_BLKSIZED, &lo->flags); + return 0; +} + +/* + * PTB - set the size in bytes of the block device. Return 0 for + * - success and -ve for failure. + */ +static int +nbd_set_size (struct nbd_device *lo, __u64 arg) +{ + int nbd = lo->nbd; + lo->bytesize = nbd_bytesizes[nbd << NBD_SHIFT] = arg; + lo->size = nbd_sizes[nbd << NBD_SHIFT] = arg >> 10; + lo->sectors = lo->size << 1; + if (lo->inode && lo->inode->i_bdev && lo->inode->i_bdev->bd_inode) + lo->inode->i_bdev->bd_inode->i_size = arg; + if (lo->disk) + set_capacity (lo->disk, arg >> 9); + atomic_set_mask (NBD_SIZED, &lo->flags); + return 0; +} + +/* WG */ +static int +nbd_set_intvl (struct nbd_device *lo, int arg) +{ + if (arg <= 0) { + NBD_ERROR ("bad pulse interval/req timeout value (%d)\n", arg); + return -EINVAL; + } + lo->req_timeo = arg; + return 0; +} + +static int +nbd_set_spid (struct nbd_slot *slot, int arg) +{ + short spid = arg; + if (arg < 0 || arg >= (1 << (sizeof (short) * 8))) { + NBD_ERROR ("bad spid value (%d)\n", arg); + return -EINVAL; + } + slot->spid = spid; + return 0; +} + +static int +nbd_set_bufferwr (struct nbd_device *lo, int arg) +{ + if (arg) { + atomic_set_mask (NBD_BUFFERWR, &lo->flags); + } else { + atomic_clear_mask (NBD_BUFFERWR, &lo->flags); + } + return 0; +} + +static int +nbd_set_remote_invalid (struct nbd_device *lo, int arg) +{ + /* + * PTB we handle the event ourself exactly when it happens + * instead of letting the kernel have check_media defined + * and doing it there (and reporting 0 to the kernel) + */ + unsigned long flags; + int do_invalidate = 0; + kdev_t dev = mk_kdev (major, lo->nbd << NBD_SHIFT); + + if (arg == 0) { + atomic_clear_mask (NBD_REMOTE_INVALID, &lo->flags); + return 0; + } + + write_lock_irqsave (&lo->meta_lock, flags); + if (!(atomic_test_and_set_mask (&lo->flags, NBD_REMOTE_INVALID))) { + /* + * PTB this tells the kernel that next open + * should cause recheck .. 
we'll agree not to
+	 * say we're happy until VALID is set again
+	 */
+		atomic_clear_mask (NBD_VALIDATED, &lo->flags);
+		// PTB test removing partitions
+		do_invalidate = 1;
+	}
+	write_unlock_irqrestore (&lo->meta_lock, flags);
+
+	if (do_invalidate) {
+		// PTB destroy buffers
+		__invalidate_buffers (dev, 1);
+		NBD_ALERT ("invalidating remote on nd%s\n", lo->devnam);
+		// PTB - clear buffers now instead of waiting for kernel
+		// PTB that will cause requests to start being errored
+		invalidate_device (dev, 0);
+	}
+
+	return 0;
+}
+/*
+ * Return the first slot index free when asking for n new ones.
+ * If there is no such gap, then NBD_MAXCONN will be returned.
+ * The return is always in the same argument address.
+ */
+static int
+nbd_get_nport (struct nbd_device *lo, int *arg)
+{
+	int err, nslot, i;
+
+	if (arg == NULL) {
+		return -EINVAL;
+	}
+
+	// PTB arg is a user pointer, so it may only be dereferenced
+	// via copy_from_user, which returns the uncopied byte count
+	err = copy_from_user ((char *) &nslot, arg, sizeof (int));
+	if (err) {
+		return -EFAULT;
+	}
+
+	for (i = 0; i < NBD_MAXCONN; i++) {
+		struct nbd_slot *sloti = &lo->slots[i];
+		int j;
+		if (sloti->file) {
+			continue;
+		}
+		// PTB check that the next nslot slots are free as well
+		for (j = i; j < NBD_MAXCONN && j < i + nslot; j++) {
+			if (lo->slots[j].file)
+				break;
+		}
+		if (j == i + nslot) {
+			break;
+		}
+	}
+
+	err = copy_to_user (arg, (char *) &i, sizeof (int));
+	return err ? -EFAULT : 0;
+}
+
+
+/*
+ * PTB - if we're not signed, accept new sig and return success.
+ * - if we are signed, compare the offer and return success if equal,
+ * - and -ve for failure.
+ *
+ * @slot the slot we're working on
+ * @sig the string of signature chars (accessed as int *)
+ */
+static int
+nbd_set_sig (struct nbd_slot *slot, int *sig)
+{
+	int err = 0;
+	int buf[NBD_SIGLEN / sizeof (int)];
+	int islot = slot->i;
+	struct nbd_device *lo = slot->lo;
+
+	if (!access_ok (VERIFY_READ, (char *) sig, NBD_SIGLEN)) {
+		NBD_ALERT ("(%d): failed sigcheck with bad user address %p\n",
+			   islot, sig);
+		err = -EINVAL;
+		return err;
+	}
+	down (&lo->pid_sem);
+
+	if (slot->pid == 0) {
+		slot->pid = current->pid;
+		slot->cli_age = jiffies;
+	}
+	if (slot->pid != current->pid) {
+		if (jiffies > slot->cli_age + 2 * HZ * lo->req_timeo) {
+			NBD_ALERT
+			    ("(%d): dead process %d was setting sig, erasing pid\n",
+			     islot, slot->pid);
+			slot->pid = 0;
+		} else {
+			NBD_ALERT
+			    ("(%d): live process %d is trying to set sig\n",
+			     islot, slot->pid);
+		}
+		up (&lo->pid_sem);
+		return -EINVAL;
+	}
+
+	if (!(atomic_read (&lo->flags) & NBD_SIGNED)) {
+		/* PTB first time grab sig */
+		copy_from_user ((char *) lo->signature, (char *) &sig[0],
+				NBD_SIGLEN);
+		atomic_set_mask (NBD_SIGNED, &lo->flags);
+		up (&lo->pid_sem);
+		return 0;
+	}
+	copy_from_user ((char *) buf, (char *) &sig[0], NBD_SIGLEN);
+
+	/* PTB test for equality */
+
+	if (memcmp (&buf[0], &lo->signature[0], NBD_SIGLEN / sizeof (int))
+	    != 0) {
+		err = -EINVAL;
+		up (&lo->pid_sem);
+		NBD_ALERT ("(%d): failed sigcheck with %d\n", islot, err);
+		return err;
+	}
+	up (&lo->pid_sem);
+	err = 0;
+	return err;
+}
+
+/*
+ * PTB - register a userspace buffer to a slot. Return 0 for success
+ * - and -ve for failure. Null arg acts as erase.
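+ *
+ * A sketch of the expected use from the userspace daemon (the names
+ * nbd_fd and bufsiz here are hypothetical, not part of this patch):
+ *
+ *	char *buf = malloc (bufsiz);	// bufsiz >= max_sectors << 9
+ *	if (ioctl (nbd_fd, MY_NBD_REG_BUF, buf) < 0)
+ *		...			// bad address, or not our slot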
+ */
+static int
+nbd_reg_buf (struct nbd_slot *slot, char *buffer)
+{
+
+	int err = 0, siz;
+	struct nbd_device *lo = slot->lo;
+
+	if (!buffer) {
+		slot->flags &= ~NBD_SLOT_BUFFERED;
+		slot->buffer = NULL;
+		slot->bufsiz = 0;
+		return 0;
+	}
+
+	siz = lo->max_sectors << 9;
+
+	/* verify the buffer is in the process space */
+	if (!access_ok (VERIFY_WRITE, buffer, siz)) {
+		err = -EINVAL;
+		return err;
+	}
+	/* PTB hope the buffer is as big as it should be - FIXME */
+	slot->buffer = buffer;
+	slot->bufsiz = siz;
+
+	/* PTB let the device bufsiz be min of registered nonzero bufsizes */
+	if (!lo->bufsiz) {
+		// PTB first time
+		lo->bufsiz = siz;
+	} else {
+		if (lo->bufsiz > siz)
+			lo->bufsiz = siz;
+	}
+
+	// PTB just in case the buffer really is small, we reset all the
+	// kernel's request maxima if we have to adjust the device max
+	if (lo->max_sectors < (lo->bufsiz >> 9)) {
+		int j;
+		lo->max_sectors = lo->bufsiz >> 9;
+		for (j = 0; j < NBD_MAXCONN; j++) {
+			nbd_max_sectors[(lo->nbd << NBD_SHIFT) + j] =
+			    lo->max_sectors;
+		}
+	}
+
+	slot->flags |= NBD_SLOT_BUFFERED;
+	return 0;
+}
+
+/*
+ * PTB - this unsets the enabled flag on the device and then clears the
+ * - queue for the device. Call without spinlock.
+ *
+ * @lo the nbd device to scan
+ */
+static int
+nbd_disable (struct nbd_device *lo)
+{
+	struct nbd_md * md = &nbd_md;
+
+	if (!lo || !(atomic_read (&lo->flags) & NBD_INITIALISED)) {
+		NBD_ALERT("nbd_disable called on bad device\n");
+		return 0;
+	}
+
+	if (atomic_test_and_clear_mask (&lo->flags, NBD_ENABLED)) {
+		NBD_ALERT ("disabled device nd%s\n", lo->devnam);
+	}
+
+	md->unnotify(md, mk_kdev (major, lo->nbd << NBD_SHIFT));
+
+	// PTB have to recheck partitions on next open
+	if (atomic_test_and_clear_mask (&lo->flags, NBD_VALIDATED)) {
+		NBD_ALERT ("invalidated device nd%s\n", lo->devnam);
+	}
+	return 0;
+}
+
+
+/*
+ * PTB - reset the enabled flag on a device and then clear all queues
+ * ( call without the spinlock held ) and then enable again.
+ */
+static void
+nbd_reenable (struct nbd_device *lo)
+{
+
+	int m, n;
+
+	if (!(atomic_read (&lo->flags) & NBD_INITIALISED))
+		return;
+	if (lo->aslot <= 0)
+		return;
+	if ((atomic_read (&lo->flags) & NBD_ENABLED))
+		return;
+
+	m = nbd_clr_queue (lo);
+	// PTB - have to call clr_kernel_queue without the io_spinlock held
+	n = nbd_clr_kernel_queue (lo);
+
+	nbd_enable(lo);
+}
+
+/*
+ * This function launches a thread which waits for a signal to reenable
+ * the device, and then sets the timer to deliver the signal.
+ */
+static int
+nbd_reenable_delay (struct nbd_device *lo, int delay)
+{
+	write_lock (&lo->meta_lock);
+	if (lo->reenable_time == 0)
+		lo->reenable_time = jiffies + delay * HZ;
+	write_unlock (&lo->meta_lock);
+	return 0;
+}
+
+
+
+/*
+ * PTB - drains device queue. Disables device.
+ * At least rollback (which we call) takes both the io spinlock and our
+ * spinlock, so we can hold neither when we are called. Also
+ * invalidate buffers, on request of Rogier Wolff.
+ */
+static int
+nbd_soft_reset (struct nbd_device *lo)
+{
+	int j;
+	const int max_clrq_retries = 100;
+	if (!(atomic_read (&lo->flags) & NBD_INITIALISED) || lo->nslot <= 0) {
+		return -EINVAL;
+	}
+	/*
+	 * PTB We push back the requests in the slot, in order to be able to
+	 * vamoosh them in a moment. This is a race, surely? We ought to
+	 * do this atomically or disable the slots first.
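+	 *
+	 * In outline the reset sequence below is (a summary, not code):
+	 *
+	 *	nbd_rollback_all (slot)   on each slot   -- push reqs back
+	 *	nbd_disable (lo)                         -- stop new traffic
+	 *	nbd_clr_queue (lo)        until empty    -- drain our queue
+	 *	invalidate_buffers (...)  on each minor  -- drop stale cache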
+ */ + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slot = &lo->slots[j]; + nbd_rollback_all (slot); + } + // PTB disable unsets the nabled flag and clears the queue + nbd_disable (lo); + for (j = 0; j < max_clrq_retries; j++) { + int m = nbd_clr_queue (lo); + if (m <= 0) + break; + } + // PTB this would unsign the device: lo->flags &= ~NBD_SIGNED; + + /* + * PTB put back invalidate buffers for use when called from + * clr_sock from nbd_release on request of Rogier Wolff. + */ + for (j = 0; j < lo->nslot; j++) { + invalidate_buffers (mk_kdev(major, (lo->nbd << NBD_SHIFT) + j)); + } + return 0; +} + +/* + * PTB - added a device/module reset for tidyness in face of rampant hacking + * - this does a soft_reset of all devices, followed bu a clr sock + * - on each, and then clears the kernel queue. It unsets the + * - enabled flag on each device. + * We have to be called without either the spinlock or the + * spinlock held, as we call soft_reset which takes both, as + * does clr_sock + */ +int +nbd_hard_reset (struct nbd_device *lo) +{ + int i; + int err = 0; + + for (i = 0; i < MAX_NBD; i++) { + struct nbd_device *lo = &nbd_dev[i]; + int j; + if (!lo->file || !lo->inode) + continue; + if (!(atomic_read(&lo->flags)&NBD_INITIALISED)) + continue; + nbd_soft_reset (lo); + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slot = &lo->slots[j]; + // PTB this takes the io spinlock and our spinlock. + nbd_clr_sock (slot); + } + // PTB - call clr_kernel_queue without the io_spinlock held + nbd_clr_kernel_queue (lo); + } + + return err; +} + +static int +indirect_ioctl_load (struct request *req, int cmd, char * buf) +{ + + int size; + int err; + struct nbd_ioctl *remote_ioctl = nbd_remote_ioctl.remote; + + if (!remote_ioctl) + return -EINVAL; + + size = remote_ioctl->size_user (cmd, buf); + + if (size < 0) { + // PTB unauthorized ioctl + err = -EINVAL; + goto error_out; + } + + if (size == 0) { + // PTB we never use the nbd devices small buffer now + req->nr_sectors = 0; + req->buffer = NULL; + return size; + } + + // PTB we have to use an extra buffer or else block + // here and rendezvous directly with the get_req call + req->nr_sectors = (size + 511) >> 9; + req->buffer = kmalloc(req->nr_sectors << 9, GFP_KERNEL); + + if (!req->buffer) { + err = -ENOMEM; + goto error_out; + } + + if (_IOC_DIR (cmd) & _IOC_WRITE) { + err = + remote_ioctl->cp_from_user (cmd, req->buffer, buf, size); + if (err < 0) { + kfree (req->buffer); + goto error_out; + } + } + return size; + +error_out: + req->buffer = NULL; + req->nr_sectors =0; + return err; +} + +static int +indirect_ioctl_store (struct request *req, int cmd, char * buf, + int size) +{ + int err; + struct nbd_ioctl * remote_ioctl = nbd_remote_ioctl.remote; + + if (!remote_ioctl) + return -EINVAL; + + if (size <= 0) + return size; + + // PTB if we are reading, it should be to the local buffer + // PTB the buffer points at a kmalloced area + + if (!req->buffer) + return -ENOMEM; + err = remote_ioctl->cp_to_user (cmd, buf, req->buffer, size); + kfree (req->buffer); + if (err < size) + return -ENOMEM; + return size; +} + +static int +do_nbd_remote_ioctl(struct nbd_device *lo, int minor, int cmd, unsigned long arg) { + + unsigned start_time, timeout; + size_t size; + int err; + struct request * req; + struct completion x; + + /* + * PTB here we have to treat remote ioctls. We should probably make + * a request and put it on the local queue, but where can we get + * the request from? We might have to keep one in reserve. 
+ * That's not a bad idea, because + * we generate it here and we delete it here, and the daemon code + * is all set up to read that sort of thing. So that's what we do ... + */ + + timeout = lo->req_timeo * HZ; + start_time = jiffies; + + while (!(req = blk_get_request(lo->q,WRITE,0))) { + if (jiffies >= start_time + timeout) { + // PTB it takes too long + NBD_ALERT + ("took too long to get a spare ioctl req: TIMEOUT\n"); + return -ETIME; + } + err = interruptible_sleep_on_timeout (&lo->req_wq, + start_time + + timeout - jiffies); + } + + set_rq_type(req, IOCTL); + + req->errors = 0; + + // PTB this is the fixed-up command + req->special = (void *) cmd; + + /* + * PTB this is (arg if it is direct, else) the address of a local buffer + * PTB we need to store the arg or its dereference somewhere local + * for a while until the cnb-client thread can enter and pick it + * up. The alternative is to block the ioctl here until it is + * picked up, which IS possible. + */ + + if (_IOC_DIR (cmd) & _IOC_READ) { + // PTB indirect + size = indirect_ioctl_load (req, cmd, (char *)arg); + if (size < 0) { + goto end; + } + } else { + // PTB direct - we just need to remember the value + size = 0; + req->buffer = (char *) arg; + } + + // PTB point the request buffer vaguely in the direction of where + // the data is, but it does not matter. + req->rq_disk = lo->disk; + + // PTB we queue the request for treatment and wait till treated + init_completion(&x); + req->waiting = &x; + nbd_enqueue (lo, req); + + for (err = 0; err <= 0; err = wait_for_completion_timeout(&x, 1)) { + + /* + * PTB on slot or queue? Don't know. Only want + * to vamoosh it if its on queue, not slot + */ + struct list_head *pos; + int time_left = start_time + timeout - jiffies; + // PTB del_req will be run with queue_lock held + static void delete_req(void) { + + // PTB change countq only under this lock + if (! (req->flags & REQ_SPECIAL)) { + write_lock(&lo->altqueue_lock); + // PTB reverse inadvertent accounting in enqueue + atomic_dec (&lo->countq[rq_data_dir(req)]); + write_unlock(&lo->altqueue_lock); + } + + list_del_init (&req->queuelist); + + req->errors = -ETIME; + if (req->nr_sectors > 0 && req->buffer) { + kfree (req->buffer); + req->buffer = NULL; + } + }; + + if (time_left > 0) + continue; + + // PTB find req on list and delete it + write_lock (&lo->queue_lock); + list_for_each (pos, &lo->queue) { + + if (req != list_entry (pos, struct request, queuelist)) + continue; + + delete_req (); + write_unlock (&lo->queue_lock); + NBD_ALERT + ("took too long to treat queued ioctl: TIMEOUT\n"); + err = -ETIME; + goto end; + } + write_unlock (&lo->queue_lock); + + } // end while loop + + + if (_IOC_DIR (cmd) & _IOC_READ) { + err = indirect_ioctl_store(req, cmd, (char *)arg, size); + if (err < 0) { + goto end; + } + } + + if (req->errors != 0) { + err = req->errors; + err = err < 0 ? 
err : -EINVAL; + } else { + err = 0; + } +end: + blk_put_request(req); + return err; + +} + +static int +find_slot (struct nbd_device *lo, int pid) +{ + int i; + // go search + for (i = 0; i < NBD_MAXCONN; i++) { + struct nbd_slot * slot = &lo->slots[i]; + if (slot->pid == pid) + break; + } + if (i < NBD_MAXCONN) + return i; // found it + // not found + return -1; +} + +static int +fixup_slot (struct nbd_device *lo, int islot, unsigned int cmd, unsigned long *arg) +{ + int intval; + + switch (cmd) { + + // PTB get slot info from parameter if not given + case NBD_CLEAR_SOCK: + case MY_NBD_CLR_REQ: + case MY_NBD_ERR_REQ: + // see if we match a known slot pid + if (arg && *arg == 0) { + islot = find_slot (lo, current->pid); + if (islot >= 0) + return islot; + } + NBD_ALERT + ("failed to find slot for pid %d for ioctl %x arg %lx\n", + current->pid, cmd, *arg); + return islot = -1; + break; + + // PTB get the slot from the 16 high bits + case NBD_SET_SOCK: + case MY_NBD_SET_SPID: + intval = *arg >> ((sizeof (int) - sizeof (short)) * 8); + intval &= (1 << (sizeof (short) * 8)) - 1; + if (intval == 0) { + // no clue in the pid high bits. Search + islot = find_slot (lo, current->pid); + if (islot >= 0) { + // PTB change arg !! + *arg &= (1 << (sizeof (short) * 8)) - 1; + return islot; // found it + } + // not found + } + NBD_ALERT + ("failed to find slot for pid %d for ioctl %x arg %lx\n", + current->pid, cmd, *arg); + return islot = -1; + break; + + case MY_NBD_GET_REQ: + case MY_NBD_ACK: + islot = find_slot (lo, current->pid); + if (islot >= 0) + return islot; + NBD_ALERT + ("failed to find slot for pid %d for ioctl %x arg %lx\n", + current->pid, cmd, *arg); + return islot; + break; + + case MY_NBD_REG_BUF: + case MY_NBD_SET_SIG: + islot = find_slot (lo, current->pid); + if (islot >= 0) + return islot; + /* + * PTB Otherwise they passed a buffer + * and the slot number is in the first 4B + * We need some magic here for safety! + * set sig is the only call that really needs + * to send its pid! + */ + + intval = -1; + if (!arg || !*arg || get_user (intval, (int *) *arg) + || intval <= 0 + || intval > NBD_MAXCONN) { + NBD_ALERT + ("failed to find slot for pid %d ioctl %x arg %lx\n", + current->pid, cmd, *arg); + return islot = -1; + } + islot = intval - 1; + + // PTB CHANGE ARG !!!! + *arg += sizeof (int); + return islot; + break; + } + + return islot = -1; +} + +/* + * PTB - generic ioctl handling + */ +static int +nbd_ioctl (struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct nbd_device *lo + = NULL; // PTB device pointer + int minor = -1; // PTB minor on which we got the ioctl + int islot = -1; // PTB slot number 0, 1, ... + int nbd = -1; // PTB the count for the device group + struct nbd_slot *slot + = NULL; // PTB slot pointer + int err; + + if (!capable(CAP_SYS_ADMIN)) { + NBD_ERROR ("caller must be root.\n"); + return -EPERM; + } + if (!inode) { + NBD_ERROR ("given bad inode.\n"); + return -EINVAL; + } + if (major (inode->i_rdev) != major) { + NBD_ERROR ("pseudo-major %d != %d\n", + major (inode->i_rdev), major); + return -ENODEV; + } + minor = minor (inode->i_rdev); + nbd = minor >> NBD_SHIFT; + if (nbd >= MAX_NBD) { + NBD_ERROR ("tried to open too many devices, %d\n", minor); + return -ENODEV; + } + lo = &nbd_dev[nbd]; + lo->harderror = 0; + islot = minor % NBD_MAXCONN - 1; + + /* + * PTB fixup breakage >= 2.5.44 caused by not being allowed to talk to + * minors. We deduce the slot number from hints in the call. 
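+	 * (A device group owns NBD_MAXCONN minors: minor 0 is the whole
+	 * device and minors 1..NBD_MAXCONN-1 are the slots, so an ioctl
+	 * arriving on the whole device gives islot = 0 % NBD_MAXCONN - 1
+	 * == -1 and must be fixed up here.)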
+ * Or we match against the known pids. + */ + if (islot < 0) { + islot = fixup_slot(lo, islot, cmd, &arg); + } + if (islot >= 0) + slot = & lo->slots[islot]; + + + // PTB these are all always local ioctls + switch (cmd) { + int err; + int intval; + int do_reenable; + + case NBD_CLEAR_SOCK: + if (islot < 0) { + NBD_ALERT ("CLEAR_SOCK called on full device nd%s arg %lx\n", + lo->devnam, arg); + return -EINVAL; + } + err = nbd_clr_sock (slot); + return err; + + case NBD_SET_SOCK: + if (islot < 0) { + NBD_ALERT ("SET_SOCK called on full device nd%s arg %lx\n", + lo->devnam, arg); + return -EINVAL; + } + err = nbd_set_sock (slot, arg); + return err; + + case BLKBSZGET: + // PTB The kernel should intercept this + NBD_ALERT ("attempted get_blksize with BLKBSZGET\n"); + return -EINVAL; + + case NBD_GET_BLKSIZE: + if (!(atomic_read (&lo->flags) & NBD_BLKSIZED)) { + return -EINVAL; + } + err = put_user (lo->blksize, (long *) arg); + return err; + + case BLKBSZSET: + // PTB The kernel should have intercepted this + NBD_ALERT ("attempted set_blksize with BLKBSZSET\n"); + return -EINVAL; + + case NBD_SET_BLKSIZE: + if (!arg) + return -EINVAL; + intval = -1; + if (get_user (intval, (int *)arg)) + return -EFAULT; + if (intval == -1) { + NBD_ALERT ("BLKBSZSET got %d from user\n", intval); + } + err = nbd_set_blksize (lo, intval); + return err; + + case NBD_SET_SIZE: + err = nbd_set_size (lo, (__u64) arg); + return err; + + case NBD_SET_SECTORS: + err = nbd_set_size (lo, ((__u64) arg) << 9); + return err; + + case MY_NBD_SET_INTVL: /* WG */ + err = nbd_set_intvl (lo, arg); + return err; + + case MY_NBD_SET_SPID: + if (islot < 0) { + NBD_ALERT ("SET_SPID called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = nbd_set_spid (slot, arg); + return err; + + case MY_NBD_SET_BUFFERWR: + err = nbd_set_bufferwr (lo, arg); + return err; + + case MY_NBD_REG_BUF: /* PTB register your buffer per socket here */ + if (!arg) { + /* PTB serves as existence check for this ioctl */ + return 0; + } + if (islot < 0) { + NBD_ALERT ("REG_BUF called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = nbd_reg_buf (slot, (char *) arg); + return err; + + case MY_NBD_SET_SIG: + if (islot < 0) { + NBD_ALERT ("SET_SIG called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = nbd_set_sig (slot, (int *) arg); + return err; + + case MY_NBD_GET_REQ: + if (islot < 0) { + NBD_ALERT ("GET_REQ called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + if (arg < 4096) { + arg = (unsigned)slot->buffer; + if (!arg) + return -EINVAL; + } + err = nbd_get_req (slot); + return err; + + case MY_NBD_GET_NPORT: + err = nbd_get_nport (lo, (int *) arg); + return err; + + case MY_NBD_CLR_REQ: + if (islot < 0) { + NBD_ALERT ("CLR_REQ called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + nbd_rollback_all (slot); + return 0; + + case MY_NBD_ERR_REQ: + if (islot < 0) { + NBD_ALERT ("ERR_REQ called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + nbd_error_all (slot); + return 0; + + case MY_NBD_SYNC: + + // PTB maybe run the reenable function + do_reenable = 0; + write_lock(&lo->meta_lock); + if (lo->reenable_time != 0 + && time_before(lo->reenable_time,jiffies)) { + lo->reenable_time = 0; + do_reenable = 1; + } + write_unlock(&lo->meta_lock); + if (do_reenable) + nbd_reenable(lo); + + // PTB error too old reqs if show_errs set, else roll them back + nbd_rollback_old (lo); + + // PTB opportunity to calculate speed + nbd_set_speed (lo); + + return 0; + + case 
MY_NBD_ACK: + if (islot < 0) { + NBD_ALERT ("NBD_ACK called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = nbd_ack (slot); + return err; + + /* let this be compiled in always - it's useful. PTB */ + case NBD_PRINT_DEBUG: + NBD_INFO("device %d: hd = %p, tl = %p, in = %d, out = %d\n", + minor, + list_head (&lo->queue, struct request, queuelist), + list_tail (&lo->queue, struct request, queuelist), + atomic_read (&lo->requests_in[READ]) + + atomic_read (&lo->requests_in[WRITE]), + atomic_read (&lo->requests_out[READ]) + + atomic_read (&lo->requests_out[WRITE]) + ); + err = 0; + return err; + case NBD_HARD_RESET: /* PTB - debugging */ + err = nbd_hard_reset (lo); + return err; + + case NBD_RESET: /* PTB - debugging */ + err = nbd_soft_reset (lo); + // PTB we reenable in 5s + nbd_reenable_delay(lo, 5); + return err; + + case NBD_SET_MD5SUM: /* PTB - change to do/plead md5summing */ + if (arg) { + atomic_set_mask (NBD_MD5SUM, &lo->flags); + } else { + atomic_clear_mask (NBD_MD5SUM, &lo->flags); + } + err = 0; + return err; + + case MY_NBD_SET_SHOW_ERRS: /* PTB/WG - change show error status */ + if (arg) { + atomic_set_mask (NBD_SHOW_ERRS, &lo->flags); + } else { + atomic_clear_mask (NBD_SHOW_ERRS, &lo->flags); + } + return 0; + + case MY_NBD_SET_DIRECT: /* PTB - change o_direct status */ + if (arg) { + atomic_set_mask (NBD_DIRECT, &lo->flags); + } else { + atomic_clear_mask (NBD_DIRECT, &lo->flags); + } + return 0; + + case MY_NBD_INVALIDATE: + err = nbd_set_remote_invalid (lo, (int) arg); + return err; + + case NBD_SET_PF_MEMALLOC: + if (arg) { + current->flags |= PF_MEMALLOC; + } else { + current->flags &= ~PF_MEMALLOC; + } + return 0; + } // PTB endsw + + // PTB these are the standard ioctls, and we might get them from + // the other side + + switch (cmd) { + int err; + int intval; + + case BLKROSET: /* PTB - change ro status */ + if (get_user(intval, (int*)arg)) + return -EFAULT; + // PTB local flags + nbd_set_read_only(lo, intval); + return 0; + + case BLKROGET: + intval = nbd_read_only(lo); + return put_user(intval, (int*)arg); + + case BLKFLSBUF: + nbd_maybe_sync_sync (lo); // PTB normally fsync_dev + // PTB device likely has buffers or caches in kernel + invalidate_buffers (inode->i_rdev); +#ifndef NO_BUFFERED_WRITES + if (atomic_read (&lo->flags) & NBD_BUFFERWR) { + // PTB got this from rd.c + // PTB destroy buffers + __invalidate_buffers (inode->i_rdev, 1); + } +#endif /* NO_BUFFERED_WRITES */ + return 0; + + case HDIO_GETGEO: + if (!arg) { + return -EINVAL; + } else { + struct hd_geometry *geo = + (struct hd_geometry *) arg; + int sectors = nbd_sizes[nbd << NBD_SHIFT] << 1; + unsigned short c; + unsigned char h, s; + if (sectors < (1 << 22)) { + h = 4; + s = 16; + c = sectors >> 6; + } else { + h = 255; + s = 63; + c = (sectors / h) / s; + } + err = 0; + if ((err = put_user (c, &geo->cylinders), err < 0) + || (err = put_user (h, &geo->heads), err < 0) + || (err = put_user (s, &geo->sectors), err < 0) + || (err = put_user (h, &geo->start), err < 0)) { + return err; + } + } + return 0; + +#ifndef BLKMDNTFY +#define BLKMDNTFY _IOW(0x12,133,sizeof(int)) +#endif + case BLKMDNTFY: + NBD_INFO ("received BLKMDNTFY, am now in raid %x\n", + (unsigned) arg); + nbd_md.inc(&nbd_md); + return 0; + +#ifndef BLKMDUNTFY +#define BLKMDUNTFY _IOW(0x12,134,sizeof(int)) +#endif + case BLKMDUNTFY: + NBD_INFO ("received BLKMDUNTFY, now out of raid %x\n", + (unsigned) arg); + nbd_md.dec(&nbd_md); + return 0; + +#ifndef BLKMDRGTR +#define BLKMDRGTR _IOW(0x12,135,sizeof(unsigned long)) 
+#endif + case BLKMDRGTR: + nbd_md.reg(&nbd_md, (int(*)(kdev_t, int))arg); + return 0; + + } // PTB endsw + + if (nbd_remote_ioctl.remote != NULL) { + struct nbd_ioctl *remote_ioctl = nbd_remote_ioctl.remote; + + if (remote_ioctl->convert_inplace (&cmd) < 0) { + NBD_ALERT ("unauthorized ioctl %#x\n", cmd); + return -EINVAL; + } + + err = do_nbd_remote_ioctl(lo, minor, cmd, arg); + return err; + } + return -EINVAL; +} + +/* + * PTB - release the device. This happens when the last process closes + * or dies. + */ +static int +nbd_release (struct inode *inode, struct file *file) +{ + struct nbd_device *lo; + int dev; + int nbd; + int islot; + + if (!inode) { + NBD_ALERT ("null inode.\n"); + return -ENODEV; + } + dev = minor (inode->i_rdev); + nbd = dev >> NBD_SHIFT; + + if (nbd >= MAX_NBD) { + // PTB impossible + NBD_ALERT ("too many open devices.\n"); + return -ENODEV; + } + + lo = &nbd_dev[nbd]; + + islot = dev % NBD_MAXCONN - 1; + + // PTB it is a daemon closing the slot? + if (islot >= 0 || (islot = find_slot(lo, current->pid), islot >= 0)) { + struct nbd_slot *slot = &lo->slots[islot]; + --slot->refcnt; + if (slot->pid == current->pid) { + + nbd_clr_sock (slot); + NBD_ALERT ("(%d): erasing slot pid %d\n", islot, slot->pid); + slot->pid = 0; + if (slot->refcnt > 0) { + NBD_ALERT + ("slot owner process %d released slot nd%s%d while not last\n", + slot->pid, lo->devnam, islot + 1); + } + } + } + + /* POSSIBLE change socket here PTB */ + + atomic_dec (&lo->refcnt); + + // PTB invalidate buffers on last close if show_err set + if (atomic_read (&lo->refcnt) <= 0 || !module_is_live(THIS_MODULE)) { + struct nbd_seqno * seqno_out = &lo->seqno_out; + //invalidate_buffers (lo->inode->i_rdev); + if (atomic_read (&lo->flags) & NBD_SHOW_ERRS) { + invalidate_buffers (mk_kdev (major, nbd << NBD_SHIFT)); + } + // PTB in any case the daemons are dead! + lo->bufsiz = 0; + seqno_out->reset(seqno_out); + } + + if (file + && (file->f_flags & O_DIRECT) + // PTB we set this to show we made iobuf + && (file->f_flags & O_NOFOLLOW)) { + file->f_flags &= ~(O_DIRECT|O_NOFOLLOW); + } + + return 0; +} + +static int +nbd_media_changed(struct gendisk *disk) { + struct nbd_device *lo = disk->private_data; + if (!lo || lo->magic != NBD_DEV_MAGIC) + return 0; + NBD_ALERT("nbd_media_changed called on nd%s\n", lo->devnam); + return (atomic_read (&lo->flags) & NBD_VALIDATED) == 0; +} + +static int +nbd_revalidate(struct gendisk *disk) { + struct nbd_device *lo = disk->private_data; + unsigned long flags; + int err = -EINVAL; + + if (!lo || lo->magic != NBD_DEV_MAGIC){ + return -EINVAL; + } + // PTB reenable part + NBD_ALERT("revalidate called on nd%s\n", lo->devnam); + write_lock_irqsave (&lo->meta_lock, flags); + if (! 
(atomic_read (&lo->flags) & NBD_REMOTE_INVALID) + && (atomic_read (&lo->flags) & NBD_ENABLED)) { + atomic_set_mask (NBD_VALIDATED, &lo->flags); + err = 0; + } + write_unlock_irqrestore (&lo->meta_lock, flags); + + return err; +} + +static struct block_device_operations nbd_blkops = { + owner: THIS_MODULE, + open: nbd_open, + release: nbd_release, + ioctl: nbd_ioctl, + media_changed: nbd_media_changed, + revalidate_disk: nbd_revalidate, +}; + +static struct gendisk * +nbd_find (dev_t dev, int *part, void *data) +{ + struct nbd_device *lo = data; + if (!lo) + return NULL; + if (lo->magic != NBD_DEV_MAGIC) + return NULL; + if (!lo->disk) + return NULL; + if (part) + NBD_ALERT("nbd_find called with part = %#x\n", (unsigned)*part); + if (part && (*part < 0 || *part >= NBD_MAXCONN)) + return NULL; + return get_disk (lo->disk); +} + + +static int +nbd_set_disk (struct nbd_device *lo, unsigned first_minor, unsigned npart) +{ + struct gendisk * disk = lo->disk; + if (!disk) + lo->disk = disk = alloc_disk (npart); + if (disk) { + disk->major = major; + disk->first_minor = first_minor; + disk->fops = &nbd_blkops; + disk->private_data = lo; + disk->queue = lo->q; + sprintf (disk->disk_name, "nd%s", lo->devnam); + // have to set minors (or capacity) to 1 (0) to avoid check disk + set_capacity (disk, 0); + add_disk (disk); + blk_register_region(MKDEV(major, first_minor), + npart, THIS_MODULE, nbd_find, NULL, lo); + set_capacity (disk, lo->bytesize >> 9); + // we should rescan later. From userland? + return 0; + } + + NBD_ERROR ("Insufficient memory for partition structs\n"); + return -ENOMEM; +} + +/* + * Pavel - And here should be modules and kernel interface + * (Just smiley confuses emacs :-) + */ + + +static void +nbd_reset(struct nbd_device *lo, int i) { + + int j; + + if (i < 0 || i >= MAX_NBD) + return; + lo->magic = NBD_DEV_MAGIC; + strncpy (lo->devnam, device_letter (i), 4); + for (j = 0; j < NBD_MAXCONN; j++) { /* PTB */ + struct nbd_slot *slot = &lo->slots[j]; + slot->lo = lo; + slot->i = j; + INIT_LIST_HEAD (&slot->queue); + } + lo->blksize = 1024; /* PTB 132 */ + lo->logblksize = 10; /* PTB */ + lo->bytesize = 0x7fffffff00000; /* PTB 132 */ + lo->size = 0x7fffffff; /* PTB (bytesizes >> 10) */ + lo->sectors = 0xfffffffe; /* PTB sectors */ + lo->nbd = i; + lo->req_timeo = NBD_REQ_TIMEO; /* PTB default pulse intvl */ + lo->max_sectors = buf_sectors; + + lo->enable = nbd_enable; + lo->reset = nbd_reset; + lo->disable = nbd_disable; + lo->read_only = nbd_read_only; + lo->set_speed = nbd_set_speed; + lo->hard_reset = nbd_hard_reset; + lo->soft_reset = nbd_soft_reset; + lo->reenable_delay = nbd_reenable_delay; + + INIT_LIST_HEAD (&lo->queue); + INIT_LIST_HEAD (&lo->altqueue); + init_waitqueue_head (&lo->wq); + init_waitqueue_head (&lo->req_wq); + init_MUTEX(&lo->pid_sem); + rwlock_init (&lo->queue_lock); + rwlock_init (&lo->altqueue_lock); + rwlock_init (&lo->meta_lock); + for (j = 0; j < NBD_MAXCONN; j++) { + nbd_blksizes[i * NBD_MAXCONN + j] = lo->blksize; + nbd_bytesizes[i * NBD_MAXCONN + j] = lo->bytesize; + nbd_sizes[i * NBD_MAXCONN + j] = lo->size; + nbd_max_sectors[i * NBD_MAXCONN + j] = lo->max_sectors; + } + nbd_init_seqno(&lo->seqno_out); + nbd_init_speed(&lo->rspeed); + nbd_init_speed(&lo->wspeed); + nbd_init_speed(&lo->tspeed); + + // PTB queuue has alreay been initialized, or will be + lo->q = nbd_queue; + + if (md5sum) { + atomic_set_mask (NBD_MD5SUM, &lo->flags); + } + if (sync_intvl) { + atomic_set_mask (NBD_SYNC, &lo->flags); + } + if (show_errs) { + atomic_set_mask 
(NBD_SHOW_ERRS, &lo->flags); + } + if (direct) { + atomic_set_mask (NBD_DIRECT, &lo->flags); + } + if (buffer_writes) { + atomic_set_mask (NBD_BUFFERWR, &lo->flags); + } + if (merge_requests) { + atomic_set(&lo->merge_requests, merge_requests); + } +} + +#ifdef MODULE +MODULE_AUTHOR ("Peter T. Breuer, Andres Marin"); +MODULE_DESCRIPTION ("Enhanced Network Block Device " NBD_VERSION); +MODULE_LICENSE ("GPL"); +#endif /* MODULE */ + +// PTB we steal these from the queue struct at init +static merge_requests_fn *ll_merge_requests_fn; +static merge_request_fn *ll_front_merge_fn; +static merge_request_fn *ll_back_merge_fn; + +/* PTB - + * These functions are needed when the kernel does request merging in + * order to stop it making requests that are bigger than our buffer. + * + * To turn OFF merging (once these functions are in place), set + * merge_requests=0. + */ +static int +nbd_merge_requests_fn (request_queue_t * q, struct request *req, + struct request *req2) +{ + struct nbd_device *lo = req->rq_disk->private_data; + + if (!atomic_read(&lo->merge_requests)) + return 0; + + if (!ll_merge_requests_fn) + return 0; + + if (req->nr_sectors + req2->nr_sectors > lo->max_sectors) + return 0; + + if (req->nr_sectors + req2->nr_sectors > + ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9))) + return 0; + + return ll_merge_requests_fn (q, req, req2); +} +static int +nbd_front_merge_fn (request_queue_t * q, struct request *req, struct bio * bio) +{ + struct nbd_device *lo = req->rq_disk->private_data; + + if (!atomic_read(&lo->merge_requests)) + return 0; + + if (!ll_front_merge_fn) + return 0; + + if (req->nr_sectors > lo->max_sectors) + return 0; + + if (req->nr_sectors > ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9))) + return 0; + + return ll_front_merge_fn (q, req, bio); +} +static int +nbd_back_merge_fn (request_queue_t * q, struct request *req, + struct bio * bio) +{ + struct nbd_device *lo = req->rq_disk->private_data; + + if (!atomic_read(&lo->merge_requests)) + return 0; + + if (!ll_back_merge_fn) + return 0; + + if (req->nr_sectors > lo->max_sectors) + return 0; + + if (req->nr_sectors > + ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9))) return 0; + + return ll_back_merge_fn (q, req, bio); +} + +// PTB - and now to play with the sysctl interface ... 
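+/*
+ * PTB the entries below appear as /proc/sys/dev/enbd/<name>; e.g.
+ * (assuming the standard sysctl userland, which is not part of this
+ * patch):
+ *
+ *	echo 0 > /proc/sys/dev/enbd/merge_requests
+ *	sysctl -w dev.enbd.md5sum=1
+ *
+ * turn merging off and md5summing on at run time.
+ */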
+static struct ctl_table_header *nbd_table_header; +// the above was set by the register call of the root table +static ctl_table nbd_table[] = { + {1, "rahead", + &rahead, sizeof (int), 0644, NULL, &proc_dointvec}, + {2, "plug", + &plug, sizeof (int), 0644, NULL, &proc_dointvec}, + {3, "sync_intvl", + &sync_intvl, sizeof (int), 0644, NULL, &proc_dointvec}, + {4, "merge_requests", + &merge_requests, sizeof (int), 0644, NULL, &proc_dointvec}, + {5, "md5sum", + &md5sum, sizeof (int), 0644, NULL, &proc_dointvec}, + {8, "md5_on_threshold", + &md5_on_threshold, sizeof (int), 0644, NULL, &proc_dointvec}, + {9, "md5_off_threshold", + &md5_off_threshold, sizeof (int), 0644, NULL, &proc_dointvec}, + {0} +}; +static ctl_table nbd_dir_table[] = { + {6, "enbd", NULL, 0, 0555, nbd_table}, + {0} +}; +static ctl_table nbd_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, nbd_dir_table}, + {0} +}; + +#ifdef CONFIG_DEVFS_FS +static devfs_handle_t devfs_handle; +static devfs_handle_t devfs_handles[MAX_NBD]; +#endif + + +int __init +nbd_init (void) +{ + int i; + int err = 0; + struct proc_dir_entry *res; + + NBD_INFO ("Network Block Device originally by pavel@elf.mj.gts.cz\n"); + NBD_INFO ("Network Block Device port to 2.0 by ptb@it.uc3m.es\n"); + NBD_INFO ("Network Block Device move networking to user space by " + "amarin@it.uc3m.es\n"); + NBD_INFO ("Enhanced Network Block Device " NBD_VERSION " by " + "ptb@it.uc3m.es\n"); + + nbd_queue = kmalloc(sizeof(*nbd_queue), GFP_KERNEL); + if (!nbd_queue) + return -ENOMEM; + + for (i = 0; i < MAX_NBD; i++) { + struct nbd_device *lo = &nbd_dev[i]; + struct gendisk *disk = alloc_disk(NBD_MAXCONN); + memset (lo, 0, sizeof (*lo)); + if (disk) + lo->disk = disk; + } + + if (register_blkdev (major, "nbd", &nbd_blkops)) { + NBD_ERROR ("Unable to register major number %d for NBD\n", + major); + return -EIO; + } +#ifdef MODULE + NBD_INFO ("registered device at major %d\n", major); +#endif + + +// PTB - set up kernel queue struct with default methods + blk_init_queue (nbd_queue, do_nbd_request, &nbd_lock); + + blk_queue_max_sectors(nbd_queue, buf_sectors); /* max per request */ + +/* + * PTB - I think that put: + * - q->plug_device_fn = generic_plug_device (static ll_rw_blk) + * - q->plug_tq.routine = generic_unplug_device (static ll_rw_blk) + * - q->back_merge_fn = ll_back_merge_fn (static ll_rw_blk) + * - q->front_merge_fn = ll_front_merge_fn (static ll_rw_blk) + * - q->merge_requests_fn = ll_merge_requests_fn (static ll_rw_blk) + * - q->request_fn = do_nbd_request (param) + */ + +/* + * PTB - we have to do some more init magic in 2.4.*. This says that we + * - take all stuff off the kernel queue before processing it, so in + * - particular iti s OK for kernel to do merges with the queue head. + * blk_queue_headactive (nbd_queue, 0); + */ + +/* + * LA - moved the next #if higher; + * - kernel 2.2.* doesn't know about plug_device_fn + */ + + // PTB control merge attempts so we do not overflow our buffer + ll_merge_requests_fn = nbd_queue->merge_requests_fn; + ll_front_merge_fn = nbd_queue->front_merge_fn; + ll_back_merge_fn = nbd_queue->back_merge_fn; + +// JSA - Add this line because under >=2.4.1, merge optimizations are in flux +/* + * PTB - however it's not this which does damage, I believe. Data: plugging + * - simply has to be enabled in these kernels. Without it, requests just + * - sit on the kernel queue and never come off and into our request_fn. + * PTB - commented the ifdef again after talks with Jens Axboe. 
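+ * - (For instance, with the default blksize 1024, logblksize 10, and
+ *   merge_requests set to n, nbd_merge_requests_fn above caps a
+ *   merged request at (n + 1) << 1 sectors, i.e. (n + 1)KB.)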
+ * - Apparently plug_fn will disappear in 2.4.4 and merge functions are + * the only way to control merges, so they MUST be included. + */ + +/* + * PTB - The functions below just impose our own stricter size limit before + * - calling the defaults if all seems OK sizewise. + */ + nbd_queue->merge_requests_fn = &nbd_merge_requests_fn; + nbd_queue->front_merge_fn = &nbd_front_merge_fn; + nbd_queue->back_merge_fn = &nbd_back_merge_fn; + + nbd_init_md(&nbd_md); + nbd_init_ioctl_stub(&nbd_remote_ioctl); + + for (i = 0; i < MAX_NBD; i++) { + struct nbd_device *lo = &nbd_dev[i]; + nbd_reset(lo, i); + } + + /* + * PTB we do the disk and partition stuff after we have + * contact, when nbd_open is called for the first time? + */ + + res = create_proc_read_entry ("nbdinfo", 0, NULL, NULL, NULL); + if (!res) { + NBD_ALERT ("creation of proc entry failed\n"); + return -EINVAL; + } + // PTB additional write_proc entry in struct + nbd_init_proc(res); + + // PTB make the gendisk structs very late. + for (i = 0; i < MAX_NBD; i++) { + struct nbd_device *lo = &nbd_dev[i]; + nbd_set_disk(lo, i * NBD_MAXCONN, NBD_MAXCONN); + } + +#ifdef CONFIG_DEVFS_FS + + devfs_handle = devfs_mk_dir (NULL, "nd", NULL); + if (devfs_handle) { + for (i = 0; i < MAX_NBD; i++) { + struct nbd_device *lo = &nbd_dev[i]; + int j; + // PTB make the subdirectory "a","b" etc. + devfs_handles[i] = + devfs_mk_dir (devfs_handle, lo->devnam, NULL); + // PTB add the blk specials, "0","1" to NBD_MAXCONN-1 + if (!devfs_handles[i]) + continue; + for (j = 0; j < MAX_NBD; j++) { + char name[4]; + sprintf (name, "%u", j); + devfs_register(devfs_handles[i], name, + DEVFS_FL_DEFAULT, + major, i * NBD_MAXCONN + j, + S_IFBLK | S_IRUSR | S_IWUSR, + &nbd_blkops, NULL); + } + // PTB do the whole disk symlink .. + devfs_mk_symlink (devfs_handles[i], "disk", + DEVFS_FL_DEFAULT, "0", + NULL, NULL); + // PTB .. 
and the channel symlinks + for (j = 1; j < MAX_NBD; j++) { + char link[4]; + char name[8]; + sprintf (link, "%u", j); + sprintf (name, "chan%u", j); + devfs_mk_symlink (devfs_handles[i], + name, + DEVFS_FL_DEFAULT, + link, NULL, NULL); + } + } + } +#endif /* CONFIG_DEVFS_FS */ + + // PTB - sysctl interface + nbd_table_header = register_sysctl_table (nbd_root_table, 1); + + // PTB we have to wait for the open to complete init with inode val + + return err; +} + +void __exit +nbd_cleanup (void) +{ + int i; + + for (i = 0; i < MAX_NBD; i++) { + + struct nbd_device *lo = &nbd_dev[i]; + int j; + + if (!(atomic_read (&lo->flags) & NBD_INITIALISED)) + continue; + + NBD_INFO ("invalidating buffers on device nd%s%d-%d\n", + lo->devnam, 0, NBD_MAXCONN); + + for (j = 0; j < NBD_MAXCONN; j++) { + int minor = i * NBD_MAXCONN + j; + invalidate_buffers (mk_kdev (major, minor)); + } + + NBD_INFO ("destroying buffers on device nd%s%d-%d\n", + lo->devnam, 0, NBD_MAXCONN); + + for (j = 0; j < NBD_MAXCONN; j++) { + int minor = i * NBD_MAXCONN + j; + __invalidate_buffers (mk_kdev (major, minor), 1); + } + } + + unregister_sysctl_table (nbd_table_header); + +#ifdef CONFIG_DEVFS_FS + if (devfs_handle) { + for (i = 0; i < MAX_NBD; i++) { + int j; + if (!devfs_handles[i]) + continue; + for (j = 0; j < NBD_MAXCONN; j++) { + char s[3]; + s[0] = '0' + j; + s[1] = 0; + if (j >= 10) { + s[0] = '1'; + s[1] = '0' + (j - 10); + s[2] = 0; + } + devfs_remove("nd/%s/%u", device_letter(i), j); + if (j == 0) { + devfs_remove("nd/%s/disk", device_letter(i)); + } else { + devfs_remove("nd/%s/chan%u",device_letter(i),j); + } + } + devfs_remove("nd/%s", device_letter(i)); + } + devfs_remove("nd"); + } +#endif + + remove_proc_entry ("nbdinfo", &proc_root); + + for (i = 0; i < MAX_NBD; i++) { + struct nbd_device *lo = &nbd_dev[i]; + atomic_clear_mask (NBD_ENABLED, &lo->flags); + if (lo->disk) { + del_gendisk(lo->disk); + put_disk(lo->disk); + } + if (lo->blockmap) { + kfree (lo->blockmap); + lo->blockmap = NULL; + } + nbd_sync_sync (lo); + } + + blk_cleanup_queue (nbd_queue); + + if (unregister_blkdev (major, "nbd") != 0) { + NBD_ALERT ("cleanup_module failed\n"); + } else { + NBD_INFO ("module cleaned up.\n"); + } + kfree(nbd_queue); + +} + +module_init (nbd_init); +module_exit (nbd_cleanup); + +EXPORT_SYMBOL(nbd_remote_ioctl); + +/* Compile line: + + * gcc -O2 -D__KERNEL__ -DMODULE -DEXPORT_SYMTAB -xc -c enbd.c -o enbd.o + * + * (possibly with -DMODVERSIONS also). PTB + * (possibly with -I/usr/src/linux-x.y.z/include also). PTB + */ --- linux-2.5.64/drivers/block/enbd/enbd_ioctl.c.pre-enbd Mon Mar 24 18:55:25 2003 +++ linux-2.5.64/drivers/block/enbd/enbd_ioctl.c Tue Mar 25 15:45:03 2003 @@ -0,0 +1,405 @@ +#ifndef __KERNEL__ +#include +#include +#endif + +#include +#include +#include +#ifndef _CADDR_T +#define caddr_t char* +#endif +#include +#include +#include +#include +#include +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif +#include +#include +#include + + +/* + * This is the whitelist of remote ioctls - an entry here tells the + * driver that it's OK to send this ioctl out over the net, because we + * have the right info on it. + * + * "The right info" is what is on the right hand side of the table (a 0 + * stands for a repetition of the LHS info). We have to fixup something + * that a lot of kernel authors forgot to do or got worng - namely + * declare their ioctls in a way that conveys information about their + * intended mode of use (see iotcl.h in the kernel sources). 
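+ *
+ * For example, fs.h declares BLKROSET as plain _IO(0x12,93) although
+ * its argument is a pointer to int; the first entry in the table
+ * below repairs it to _IOW(0x12,93,int), so that we know to copy an
+ * int in from userspace before shipping the ioctl over the net.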
+ *
+ * We need all ioctls to be declared as either
+ *
+ *	_IO(class,id)        -- default. Means no args. The call is enough.
+ *	_IOW(class,id,type)  -- we write a value to kernel that is sizeof(type)
+ *	_IOR(class,id,type)  -- we read a value from kernel sizeof(type)
+ *	_IOWR(class,id,type) -- ibid, but both ways
+ *
+ * The "R" bit is crucial because it lets us know that the data is
+ * _indirected_. I.e. it's an address of somewhere in userspace where
+ * we want to read data from or write data to.
+ *
+ * The "type" part should be the type of the indirected argument, NOT
+ * the type of its address!
+ *
+ * Kernel authors typically make two mistakes:
+ *
+ *	1) they write _IO instead of _IOR or _IOWR, and hence forget the
+ *	   type info. Well, not telling me if the argument data is
+ *	   direct or indirectly accessible was already bad enough!
+ *	2) they get the type argument _wrong_ when they do remember to
+ *	   put it. They write "int *" instead of "int", for example,
+ *	   when the argument to the ioctl is a pointer to an integer.
+ *	   OK, so it's a natural mistake to make! But in that case the
+ *	   argument should be "int" so that the kernel macro picks up
+ *	   sizeof(int) instead of sizeof(int*).
+ *
+ * Those "errors" have to be repaired via this table. Wrong at left,
+ * corrected at right. A 0 for the new entry indicates that the old
+ * was alright. If there isn't an entry, the ioctl won't be treated.
+ * If the size info works out at the max for the field (2^14 - 1)
+ * then an extra table is consulted for size and copy methods.
+ */
+
+
+/*
+ * PTB the space before the final comma is important as the ##
+ * discards the preceding token when D is empty
+ */
+#define _NEW_IO_(B,C,D...)	C(_IOC_TYPE(B), _IOC_NR(B) , ## D)
+#define _NEW_IO(B,D...)		_IO(_IOC_TYPE(B), _IOC_NR(B) , ## D)
+#define _NEW_IOW(B,D...)	_IOW(_IOC_TYPE(B), _IOC_NR(B) , ## D)
+#define _NEW_IOR(B,D...)	_IOR(_IOC_TYPE(B), _IOC_NR(B) , ## D)
+#define _NEW_IOWR(B,D...)
_IOWR(_IOC_TYPE(B), _IOC_NR(B) , ## D) +#define _NEW_IORS(B) _IOC(_IOC_READ,_IOC_TYPE(B), _IOC_NR(B), _IOC_SIZEMASK) +#define _NEW_IOWRS(B) _IOC(_IOC_READ|_IOC_WRITE,_IOC_TYPE(B), _IOC_NR(B), _IOC_SIZEMASK) + +static struct ioctl_conv ioctl_conv_tab[] = { +// fs.h + { BLKROSET, _NEW_IOW(BLKROSET,int), }, + { BLKROGET, _NEW_IOR(BLKROGET,int), }, +//#define BLKRRPART _IO(0x12,95) /* re-read partition table */ + { BLKRRPART, 0, }, + { BLKGETSIZE, _NEW_IOR(BLKGETSIZE,int), }, +//#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ + { BLKFLSBUF, 0, }, + { BLKRASET, _NEW_IOW(BLKRASET,int), }, + { BLKRAGET, _NEW_IOR(BLKRAGET,int), }, + { BLKFRASET, _NEW_IOW(BLKFRASET,int), }, + { BLKFRAGET, _NEW_IOR(BLKFRAGET,int), }, + { BLKSECTSET, _NEW_IOW(BLKSECTSET,int), }, + { BLKSECTGET, _NEW_IOR(BLKSECTGET,int), }, + { BLKSSZGET, _NEW_IOR(BLKSSZGET,int), }, +// fd.h + { FDCLRPRM, 0, }, + { FDSETPRM, _NEW_IOWR(FDSETPRM, struct floppy_struct), }, + { FDDEFPRM, _NEW_IOWR(FDDEFPRM, struct floppy_struct), }, + { FDGETPRM, _NEW_IOR(FDGETPRM, struct floppy_struct), }, + { FDMSGON, 0, }, + { FDMSGOFF, 0, }, + { FDFMTBEG, 0, }, + { FDFMTTRK, _NEW_IOWR(FDFMTTRK, struct format_descr), }, + { FDFMTEND, 0, }, + { FDSETEMSGTRESH, _NEW_IOW(FDSETEMSGTRESH, unsigned), }, + { FDFLUSH, 0, }, + { FDSETMAXERRS, _NEW_IOWR(FDSETMAXERRS, struct floppy_max_errors), }, + { FDGETMAXERRS, _NEW_IOR(FDGETMAXERRS, struct floppy_max_errors), }, + { FDGETDRVTYP, _NEW_IOR(FDGETDRVTYP, floppy_drive_name), }, // 16 bytes + { FDSETDRVPRM, _NEW_IOWR(FDSETDRVPRM, struct floppy_drive_params), }, + { FDGETDRVPRM, _NEW_IOR(FDGETDRVPRM, struct floppy_drive_params), }, + { FDGETDRVSTAT, _NEW_IOR(FDGETDRVSTAT, struct floppy_drive_struct), }, + { FDPOLLDRVSTAT, _NEW_IOR(FDPOLLDRVSTAT, struct floppy_drive_struct), }, + { FDRESET, 0, }, + { FDGETFDCSTAT, _NEW_IOR(FDGETFDCSTAT, struct floppy_fdc_state), }, + { FDWERRORCLR, 0, }, + { FDWERRORGET, _NEW_IOR(FDWERRORGET, struct floppy_write_errors), }, + { FDRAWCMD, _NEW_IOWR(FDRAWCMD, struct floppy_raw_cmd[1]) }, // FIXME linked list + { FDTWADDLE, 0, }, + { FDEJECT, 0, }, +// cdrom.h + { CDROMPAUSE, _NEW_IO(CDROMPAUSE), }, + { CDROMRESUME, _NEW_IO(CDROMRESUME), }, + { CDROMPLAYMSF, _NEW_IOR(CDROMPLAYMSF, struct cdrom_msf), }, + { CDROMPLAYTRKIND, _NEW_IOR(CDROMPLAYTRKIND, struct cdrom_ti), }, + { CDROMREADTOCHDR, _NEW_IOWR(CDROMREADTOCHDR, struct cdrom_tochdr), }, + { CDROMREADTOCENTRY, _NEW_IOWR(CDROMREADTOCENTRY, struct cdrom_tocentry), }, + { CDROMSTOP, _NEW_IO(CDROMSTOP), }, + { CDROMSTART, _NEW_IO(CDROMSTART), }, + { CDROMEJECT, _NEW_IO(CDROMEJECT), }, + { CDROMVOLCTRL, _NEW_IOR(CDROMVOLCTRL, struct cdrom_volctrl), }, + { CDROMSUBCHNL, _NEW_IOWR(CDROMSUBCHNL, struct cdrom_subchnl), }, + { CDROMREADMODE2, _NEW_IOR(CDROMREADMODE2, struct cdrom_read), }, // INDIRECT 2336B + { CDROMREADMODE1, _NEW_IOR(CDROMREADMODE1, struct cdrom_read), }, // INDIRECT 2048B + { CDROMREADAUDIO, _NEW_IOR(CDROMREADAUDIO, struct cdrom_read_audio), }, + { CDROMEJECT_SW, _NEW_IO(CDROMEJECT_SW), }, + { CDROMMULTISESSION, _NEW_IOWR(CDROMMULTISESSION, struct cdrom_multisession), }, + { CDROM_GET_MCN, _NEW_IOWR(CDROM_GET_MCN, struct cdrom_mcn), }, + { CDROMRESET, _NEW_IO(CDROMRESET), }, + { CDROMVOLREAD, _NEW_IOWR(CDROMVOLREAD, struct cdrom_volctrl), }, + { CDROMREADRAW, _NEW_IOR(CDROMREADRAW, struct cdrom_read), }, // INDIRECT 2352B + // aztcd.c optcd.c + { CDROMREADCOOKED, _NEW_IOR(CDROMREADCOOKED, struct cdrom_msf), }, // INDIRECT FIXME + { CDROMSEEK, _NEW_IOR(CDROMSEEK, struct cdrom_msf), }, + // scsi-cd.c + { 
CDROMPLAYBLK, _NEW_IOWR(CDROMPLAYBLK, struct cdrom_blk), }, + // optcd.c + { CDROMREADALL, _NEW_IOR(CDROMREADALL, char[2646]), }, + // ide-cd.c + { CDROMGETSPINDOWN, _NEW_IOWR(CDROMGETSPINDOWN, char), }, // one byte + { CDROMSETSPINDOWN, _NEW_IOWR(CDROMSETSPINDOWN, char), }, // one byte + // cdrom.c + { CDROMCLOSETRAY, _NEW_IO(CDROMCLOSETRAY), }, + { CDROM_SET_OPTIONS, _NEW_IOW(CDROM_SET_OPTIONS, int), }, + { CDROM_CLEAR_OPTIONS, _NEW_IOW(CDROM_CLEAR_OPTIONS, int), }, + { CDROM_SELECT_SPEED, _NEW_IOW(CDROM_SELECT_SPEED, int), }, // FIXME + { CDROM_SELECT_DISC, _NEW_IOW(CDROM_SELECT_DISC, int), }, + { CDROM_MEDIA_CHANGED, _NEW_IOW(CDROM_MEDIA_CHANGED, int), }, + { CDROM_DRIVE_STATUS, _NEW_IOW(CDROM_DRIVE_STATUS, int), }, + { CDROM_DISC_STATUS, _NEW_IO(CDROM_DISC_STATUS), }, + { CDROM_CHANGER_NSLOTS, _NEW_IO(CDROM_CHANGER_NSLOTS), }, + { CDROM_LOCKDOOR, _NEW_IOW(CDROM_LOCKDOOR, int), }, + { CDROM_DEBUG, _NEW_IOW(CDROM_DEBUG, int), }, + { CDROM_GET_CAPABILITY, _NEW_IO(CDROM_GET_CAPABILITY), }, + // sbpcd + { CDROMAUDIOBUFSIZ, _NEW_IOW(CDROMAUDIOBUFSIZ, int), }, + // dvd + { DVD_READ_STRUCT, _NEW_IOR(DVD_READ_STRUCT, dvd_struct), }, + { DVD_WRITE_STRUCT, _NEW_IOWR(DVD_WRITE_STRUCT, dvd_struct), }, + { DVD_AUTH, _NEW_IOWR(DVD_AUTH, dvd_authinfo), }, + { CDROM_SEND_PACKET, _NEW_IOR(CDROM_SEND_PACKET, struct cdrom_generic_command), }, + { CDROM_NEXT_WRITABLE, _NEW_IOWR(CDROM_NEXT_WRITABLE, long), }, + { CDROM_LAST_WRITTEN, _NEW_IOWR(CDROM_LAST_WRITTEN, long), }, + // PTB local test ioctls + { NBD_TEST_IOCTL1, 0, }, // write an int + { NBD_TEST_IOCTL2, 0, }, // read an int + { NBD_TEST_IOCTL3, 0, }, // write and read an int + { NBD_TEST_IOCTL4, 0, }, // read 256B + { NBD_TEST_IOCTL5, 0, }, // r/w 256B + { NBD_TEST_IOCTL6, _NEW_IORS(NBD_TEST_IOCTL6), }, // read special + { NBD_TEST_IOCTL7, _NEW_IORS(NBD_TEST_IOCTL7), }, // r/w special + // PTB we must terminate with a 0,0 entry. + {0 , 0, }, +}; + +/* + * This should be the table of special methods for certain ioctls. + * The "new" code is the real index. It will have a size count of + * _IOC_SIZEMASK but the rest of it should be meaningful. The size is + * gotten by dynamic lookup using the size() function. + */ +static struct ioctl_special ioctl_special_tab[] = { + // PTB last entry must be all zeros + { 0, NULL, NULL, NULL, NULL, }, +}; + + +static struct ioctl_conv * +ioctl_lookup_old (int ioctl) +{ + int i; + unsigned old; + if (ioctl == -1) + return NULL; + for (i = 0; old = ioctl_conv_tab[i].old, old; i++) { + if (old == ioctl) + return &ioctl_conv_tab[i]; + } + // PTB not there + return NULL; +} + +int +nbd_ioctl_convert (int ioctl) +{ + struct ioctl_conv *conv = ioctl_lookup_old (ioctl); + if (!conv) + // PTB not there + return -1; + return conv->new ? 
: ioctl;
+}
+
+int
+nbd_ioctl_convert_inplace(int *ioctl) {
+
+	int new_ioctl;
+	if (!ioctl)
+		return -EINVAL;
+	new_ioctl = nbd_ioctl_convert(*ioctl);
+	if (new_ioctl == -1)
+		return -EINVAL;
+	*ioctl = new_ioctl;
+	return 0;
+}
+
+static struct ioctl_conv *
+ioctl_lookup_new (int ioctl)
+{
+	int i = 0;
+	unsigned old;
+	for (i = 0; old = ioctl_conv_tab[i].old, old; i++) {
+		unsigned new = ioctl_conv_tab[i].new;
+		if (new == ioctl || (new == 0 && old == ioctl))
+			return &ioctl_conv_tab[i];
+	}
+	// PTB not there
+	return NULL;
+}
+
+int
+nbd_ioctl_revert (int ioctl)
+{
+	struct ioctl_conv *conv = ioctl_lookup_new (ioctl);
+	if (!conv)
+		// PTB not there
+		return -1;
+	return conv->old;
+}
+
+static struct ioctl_special *
+ioctl_special_lookup_new (int ioctl)
+{
+	int i;
+	unsigned new;
+	for (i = 0; new = ioctl_special_tab[i].new, new; i++) {
+		if (new == ioctl)
+			return &ioctl_special_tab[i];
+	}
+	// PTB not there
+	return NULL;
+}
+
+int
+nbd_ioctl_size (int cmd, char *arg)
+{
+	int size = _IOC_SIZE (cmd);
+	if (size == _IOC_SIZEMASK) {
+		// PTB special handling required.
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->size (arg);
+	}
+	return size;
+}
+
+int
+nbd_ioctl_size_user (int cmd, char *arg)
+{
+	int size = _IOC_SIZE (cmd);
+	if (size == _IOC_SIZEMASK) {
+		// PTB special handling required.
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->size_user (arg);
+	}
+	return size;
+}
+
+
+#ifdef __KERNEL__
+int
+nbd_ioctl_copy_to_user (int cmd, char *arg, char *buf, int size)
+{
+
+	if (_IOC_SIZE (cmd) == _IOC_SIZEMASK) {
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->ioctl_copy_to_user (arg, buf, size);
+	}
+
+	if (_IOC_DIR (cmd) & _IOC_READ) {
+		// indirect
+		copy_to_user (arg, buf, size);
+		return size;
+	}
+
+	return -1;
+}
+
+
+
+int
+nbd_ioctl_copy_from_user (int cmd, char *buf, char *arg, int size)
+{
+
+	if (_IOC_SIZE (cmd) == _IOC_SIZEMASK) {
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->ioctl_copy_from_user (buf, arg, size);
+	}
+
+	if (_IOC_DIR (cmd) & _IOC_READ) {
+		// indirect
+		copy_from_user (buf, arg, size);
+		return size;
+	}
+
+	// direct
+	if (size > sizeof (arg)) {
+		return -1;
+	}
+
+	memcpy (buf, &arg, size);
+	return size;
+}
+
+static struct nbd_ioctl struct_ioctl = {
+	convert		: nbd_ioctl_convert,
+	convert_inplace	: nbd_ioctl_convert_inplace,
+	revert		: nbd_ioctl_revert,
+	size		: nbd_ioctl_size,
+	size_user	: nbd_ioctl_size_user,
+	cp_to_user	: nbd_ioctl_copy_to_user,
+	cp_from_user	: nbd_ioctl_copy_from_user,
+};
+
+static int __init
+nbd_ioctl_init (void)
+{
+	struct nbd_ioctl_stub * remote_ioctl = &nbd_remote_ioctl;
+	remote_ioctl->reg(remote_ioctl, &struct_ioctl);
+	return 0;
+}
+
+static void __exit
+nbd_ioctl_cleanup (void) {
+	struct nbd_ioctl_stub * remote_ioctl = &nbd_remote_ioctl;
+	remote_ioctl->unreg(remote_ioctl, &struct_ioctl);
+}
+
+module_init (nbd_ioctl_init);
+module_exit (nbd_ioctl_cleanup);
+
+int linux_version_code = LINUX_VERSION_CODE;
+
+#ifdef MODULE
+ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,1,0)
+ MODULE_AUTHOR ("Peter T. 
Breuer"); + MODULE_DESCRIPTION ("Enhanced Network Block Device Remote Ioctl"); + #ifdef MODULE_LICENSE + MODULE_LICENSE("GPL"); + #endif + #endif +#endif /* MODULE */ + +#endif /* __KERNEL__ */ + + +/* +static +int ioctl_init(struct ioctl_conv *self, int old, int new) { + self->old = old; + self->new = new; + self->serialize = ioctl_serialize; + self->deserialize = ioctl_deserialize; + self->size = ioctl_size; +} +*/ --- linux-2.5.64/drivers/block/enbd/enbd_ioctl_stub.c.pre-enbd Mon Mar 24 18:55:25 2003 +++ linux-2.5.64/drivers/block/enbd/enbd_ioctl_stub.c Mon Mar 24 22:51:59 2003 @@ -0,0 +1,30 @@ + +#include +#include +#include +#include + + /* + * PTB this is the hook for the enbd_ioctl extra module + */ + static int register_remote_ioctl(struct nbd_ioctl_stub *remote_ioctl, struct nbd_ioctl *x) { + if (!remote_ioctl->remote) { + remote_ioctl->remote = x; + return 0; + } + return -EINVAL; + } + static int unregister_remote_ioctl(struct nbd_ioctl_stub *remote_ioctl, struct nbd_ioctl *x) { + if (remote_ioctl->remote != x) + return -EINVAL; + remote_ioctl->remote = NULL; + return 0; + } + +int nbd_init_ioctl_stub(struct nbd_ioctl_stub *remote_ioctl) { + memset(remote_ioctl, 0, sizeof(*remote_ioctl)); + remote_ioctl->reg = register_remote_ioctl; + remote_ioctl->unreg = unregister_remote_ioctl; + return 0; +} + --- linux-2.5.64/drivers/block/enbd/enbd_md.c.pre-enbd Mon Mar 24 18:55:25 2003 +++ linux-2.5.64/drivers/block/enbd/enbd_md.c Mon Mar 24 22:51:59 2003 @@ -0,0 +1,99 @@ +#include +#include + +/* + * PTB small driver wide support database for MDRGTR ioctl + */ + + + +#ifndef HOT_ADD_DISK + #define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#endif +#ifndef SET_DISK_FAULTY + #define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) +#endif + +static void +nbd_md_dec (struct nbd_md *md) { + down(&md->access_sem); + if (--md->count <= 0) + md->notify_fn = NULL; + up(&md->access_sem); +} +static void +nbd_md_inc (struct nbd_md *md) { + down(&md->access_sem); + md->count++; + up(&md->access_sem); +} +static void +nbd_md_reg (struct nbd_md *md, int(*fn)(kdev_t, int)) { + down(&md->access_sem); + if (!md->notify_fn) { + md->notify_fn = fn; + md->count++; + } + up(&md->access_sem); +} + +/* + * PTB tell md devices in which we are embedded that we are alright + * + * @lo the nbd device to tell them about + */ +static int +nbd_notify_md_devices (struct nbd_md *md, kdev_t nbd_dev) +{ + //kdev_t nbd_dev = mk_kdev (major, nbd << NBD_SHIFT); + int err; + + down (&md->access_sem); + if (md->count > 0 && md->notify_fn) { + NBD_ALERT ("adding %x:%x to raid devices via fn\n", + major(nbd_dev), minor(nbd_dev)); + err = md->notify_fn (nbd_dev, HOT_ADD_DISK); + if (err < 0) { + NBD_ALERT ("HOT_ADD to raid devices returned %d\n", + err); + } + } + up (&md->access_sem); + + return 0; +} + +static int +nbd_unnotify_md_devices (struct nbd_md *md, kdev_t nbd_dev) +{ + + // kdev_t nbd_dev = mk_kdev (major, lo->nbd << NBD_SHIFT); + int err; + + down (&md->access_sem); + if (md->count > 0 && md->notify_fn) { + NBD_ALERT ("erasing %x:%x from raid devices via fn\n", + major(nbd_dev), minor(nbd_dev)); + err = md->notify_fn (nbd_dev, SET_DISK_FAULTY); + if (err < 0) { + NBD_ALERT + ("SETFAULTY to raid devices returned %d\n", err); + } + } + up (&md->access_sem); + return 0; +} + +void +nbd_init_md(struct nbd_md *md) +{ + md->notify_fn = NULL; + md->count = 0; + init_MUTEX(&md->access_sem); + md->notify = nbd_notify_md_devices; + md->unnotify = nbd_unnotify_md_devices; + md->dec = nbd_md_dec; + md->inc = nbd_md_inc; + md->reg = nbd_md_reg; +} 
+
--- linux-2.5.64/drivers/block/enbd/enbd_seqno.c.pre-enbd	Mon Mar 24 18:55:25 2003
+++ linux-2.5.64/drivers/block/enbd/enbd_seqno.c	Mon Mar 24 23:28:41 2003
@@ -0,0 +1,75 @@
+#include
+
+
+#define _NBD_GENERATION (8*sizeof(int) - __REQ_NBDSEQNO)
+/*
+ * PTB increment the device's seqno
+ *
+ * @nseqno the seqno object to increment
+ */
+static void
+seqno_inc(struct nbd_seqno *nseqno)
+{
+	if (nseqno->seqno < (1 << _NBD_GENERATION)) {
+		++nseqno->seqno;
+		return;
+	}
+	// PTB next generation !
+	nseqno->seqno = 0;
+	atomic_inc (&nseqno->seqno_gen);
+}
+static int
+seqno_get (struct nbd_seqno *nseqno)
+{
+	return nseqno->seqno;
+}
+static void
+seqno_reset (struct nbd_seqno *nseqno)
+{
+	nseqno->seqno = 0;
+	atomic_set(&nseqno->seqno_gen,0);
+}
+/*
+ * PTB convert a seqno number into one with an extra generation number
+ * in the msb, so that it can be compared with others. return the
+ * result.
+ *
+ * We add the current generation no. to small seqnos, and we add the
+ * previous generation no. to large seqnos.
+ *
+ * @nseqno the seqno object to look at
+ * @seqno the small sequence number to return the full seq number for
+ */
+static unsigned int
+seqno_calc (struct nbd_seqno *nseqno, unsigned int seqno)
+{
+	unsigned int genno;
+	unsigned int absdiff(unsigned int x, unsigned int y) {
+		if (x > y) {
+			return x - y;
+		} else {
+			return y - x;
+		}
+	};
+	genno = atomic_read (&nseqno->seqno_gen);
+	if (absdiff(seqno,nseqno->seqno) < (1 << (_NBD_GENERATION - 1))) {
+		return seqno + (genno << _NBD_GENERATION);
+	}
+	if (seqno < nseqno->seqno) {
+		return seqno + ((genno + 1) << _NBD_GENERATION);
+	}
+	return seqno + ((genno - 1) << _NBD_GENERATION);
+}
+
+void nbd_init_seqno (struct nbd_seqno *nseqno) {
+
+	seqno_reset(nseqno);
+
+	nseqno->inc = seqno_inc;
+	nseqno->get = seqno_get;
+	nseqno->reset = seqno_reset;
+	nseqno->calc = seqno_calc;
+}
+
+
+
--- linux-2.5.64/drivers/block/enbd/enbd_speed.c.pre-enbd	Mon Mar 24 18:55:25 2003
+++ linux-2.5.64/drivers/block/enbd/enbd_speed.c	Mon Mar 24 23:28:50 2003
@@ -0,0 +1,64 @@
+#include
+#include
+
+/*
+ * PTB - update speed counters (if at least 5s has passed)
+ *
+ * @spd the speed struct to update
+ */
+static void
+spd_update (struct nbd_speed *spd, int distance)
+{
+
+	// last time we measured
+	int lastjiffy = atomic_read (&spd->jiffy);
+	// jiffies since last time
+	int djiffy = jiffies - lastjiffy;
+
+	// previous count we measured
+	int lastdist = atomic_read (&spd->distance);
+	// blocks since last time
+	int ddistance = distance - lastdist;
+
+	// write every 5 seconds in time
+	if (djiffy > 5 * HZ) {
+
+		// jiffies since first time
+		int tjiffy = jiffies - atomic_read (&spd->frstj);
+
+		// max tot speed measured so far
+		int speedmax = atomic_read (&spd->speedmax);
+
+		// last instantaneous speed we measured
+		int lastspeed = atomic_read (&spd->speed);
+
+		// instantaneous read blocks/s
+		int speed = djiffy ? (ddistance * HZ) / djiffy : 0;
+
+		// smoothed KB/s
+		int speedsmoothed =
+		    (djiffy * speed + HZ * lastspeed) / (djiffy + HZ);
+
+		// average speed to now in KB/s
+		int speedav = tjiffy ? (distance * HZ) / tjiffy : 0;
+
+		// higher of average and smoothed, as candidate for max
+		int speedhi =
+		    (speedav > speedsmoothed) ?
speedav : speedsmoothed; + + // doing settings + atomic_set (&spd->speed, speedsmoothed); + if (speedhi > speedmax) + atomic_set (&spd->speedmax, speedhi); + atomic_set (&spd->distance, distance); + atomic_set (&spd->speedav, speedav); + atomic_set (&spd->jiffy, jiffies); + } +} + +void +nbd_init_speed(struct nbd_speed *spd) { + memset(spd, 0, sizeof(*spd)); + spd->update = spd_update; +} + --- linux-2.5.64/drivers/block/enbd/enbd_proc.c.pre-enbd Tue Mar 25 15:16:33 2003 +++ linux-2.5.64/drivers/block/enbd/enbd_proc.c Mon Mar 24 22:52:00 2003 @@ -0,0 +1,1059 @@ + +#include +#include + +extern struct nbd_device * nbd_get(int i); + +static void +do_reset (int reset, int i) { + + void do_r (void) { + struct nbd_device *lo = nbd_get(i); + if (reset != 0) { + lo->reset (lo, i); + return; + }; + }; + if (i >= 0 && i < MAX_NBD) { + do_r (); + return; + } +} + +/* + * PTB This is just to get a nice limited width integer printout in proc! + * use endpos (<= 8) spaces at most. We serve from a static buffer size 16. + */ +char * +display (unsigned n, int endpos) +{ + // PTB use endpos (<= 8) spaces at most + static char buf[16]; + int units = 0; + int decimals = 0; + int decpos = endpos; + int wholepart = n, fractionpart = 0; + buf[endpos--] = 0; + // PTB find the right units to display. U or K or M or G. + while (n >= 1 << 10) { + decimals = n & ((1 << 10) - 1); + n >>= 10; + units++; + } + switch (units) { + case 0: + break; + case 1: + buf[endpos--] = 'K'; + break; + case 2: + buf[endpos--] = 'M'; + break; + case 3: + buf[endpos--] = 'G'; + break; + case 4: + buf[endpos--] = 'T'; + break; + } + // after this wholepart = n && fractionpart = decimals + fractionpart = wholepart & ((1 << (units * 10)) - 1); + wholepart >>= units * 10; + // PTB write the whole digits (something between 0 and 1023 inclusive) + if (n == 0) { + buf[endpos--] = '0'; + } else { + while (endpos >= 0 && n > 0) { + buf[endpos--] = '0' + n % 10; + n /= 10; + } + } + // PTB if there is space and cause, add decimal digits + if (endpos >= 1 && units > 0) { + int k = 0; + char unitchar = buf[--decpos]; + buf[decpos + k++] = '.'; + while (endpos >= k) { + int digit = (decimals * 10) >> 10; + buf[decpos + k++] = '0' + digit; + decimals -= (digit << 10) / 10; + decimals *= 10; + } + buf[decpos + k++] = unitchar; + buf[decpos + k] = 0; + } + // PTB report the start position + return buf + endpos + 1; +} + + +static void +set_generic (int x, int i, int X) +{ + void set_x (void) { + struct nbd_device *lo = nbd_get(i); + if (lo->magic != NBD_DEV_MAGIC) + return; + if (x != 0) { + atomic_set_mask (X, &lo->flags); + return; + }; + atomic_clear_mask (X, &lo->flags); + }; + + if (i >= 0 && i < MAX_NBD) { + set_x (); + return; + } + for (i = 0; i < MAX_NBD; i++) { + set_x (); + } +} + +static void +set_sync_intvl (int sync_intvl, int i) +{ + set_generic(sync_intvl, i, NBD_SYNC); +} + + +static void +set_show_errs (int show_errs, int i) +{ + set_generic(show_errs, i, NBD_SHOW_ERRS); +} + +static void +set_md5sum (int md5sum, int i) +{ + set_generic(md5sum, i, NBD_MD5SUM); +} + + +static void +set_enable (int enable, int i) +{ + void set_e (void) { + struct nbd_device *lo = nbd_get(i); + if (!lo || lo->magic != NBD_DEV_MAGIC) + return; + if (enable != 0) { + if (!(atomic_read (&lo->flags) & NBD_ENABLED)) { + lo->enable (lo); + return; + } + }; + lo->disable (lo); + }; + + if (i >= 0 && i < MAX_NBD) { + set_e (); + return; + } + for (i = 0; i < MAX_NBD; i++) { + set_e (); + } +} + +static void +set_direct (int direct, int i) +{ + 
set_generic(direct, i, NBD_DIRECT); +} + +#ifndef NO_BUFFERED_WRITES +static void +set_buffer_writes (int buffer_writes, int i) +{ + set_generic(buffer_writes, i, NBD_BUFFERWR); +} +#endif + +static void +set_merge_requests (int mr, int i) +{ + void set_mr (void) { + struct nbd_device *lo = nbd_get(i); + if (lo->magic != NBD_DEV_MAGIC) + return; + atomic_set (&lo->merge_requests, mr); + } + + if (i >= 0 && i < MAX_NBD) { + set_mr (); + return; + } + for (i = 0; i < MAX_NBD; i++) { + set_mr (); + } +} + +int +nbd_read_proc (char *buf, char **start, off_t offset, int len, int *eof, + void *data) +{ + +#ifndef MIN +#define MIN(x,y) ((x)<(y)?(x):(y)) +#endif + + const int limit = MIN (PAGE_SIZE, len) - 80; + static int i; + struct nbd_device *lo; + static int last; + static void *next_label; + static char *next_label_name; + static int total; + unsigned long flags; + + if (offset > 0 && !next_label) { + *eof = 1; + *start = buf; + return 0; + } + + if (offset <= 0) { + // PTB do static inits first time through + last = -1; + i = 0; + next_label = NULL; + next_label_name = NULL; + total = 0; + } + + // PTB start this bytecount + len = 0; + +#define NBD_PROC_LABEL(n) \ + next_label = &&label_##n; \ + next_label_name = "label_" #n; \ + if (len > limit) { \ + *start = (char *) (unsigned long) len; \ + total += len; \ + return len;\ + } \ + label_##n: + + for ( /* static init */ ; i < MAX_NBD; i++) { + + char *devnam; + + lo = nbd_get(i); + devnam = lo->devnam; + if (lo->nslot <= 0) { + next_label = NULL; + continue; + } + + // PTB computed goto next not-done + if (next_label) { + void *label = next_label; + next_label = NULL; + next_label_name = NULL; + len = 0; + goto *label; + } + + NBD_PROC_LABEL (1); + + if (last == i - 2) { + struct nbd_device * lo = nbd_get (i - 1); + char *prevdevnam = lo->devnam; + len += + sprintf (buf + len, "Device %s:\tClosed\n", + prevdevnam); + } + if (last < i - 2) { + struct nbd_device * llo = nbd_get (last + 1); + struct nbd_device * plo = nbd_get (i - 1); + char lastdevnam[3]; + char prevdevnam[3]; + strncpy (lastdevnam, llo->devnam, 3); + strncpy (prevdevnam, plo->devnam, 3); + len += + sprintf (buf + len, "Device %s-%s:\tClosed\n", + lastdevnam, prevdevnam); + } + + NBD_PROC_LABEL (2); + + len += + sprintf (buf + len, "Device %s:\tOpen " "\n", devnam); + + NBD_PROC_LABEL (3); + + len += sprintf (buf + len, + "[%s] State:\t%s%s%s%s%s%s%s%s%s%s%s%slast error %d, lives %d, bp %d\n", + devnam, atomic_read (&lo->flags) + & NBD_INITIALISED ? "" : "uninitialized, ", + atomic_read (&lo->flags) + & NBD_WRITE_NOCHK ? "noverify, " : + "verify, ", lo->read_only(lo) ? "ro, " : "rw, ", + atomic_read(&lo->merge_requests) ? "merge requests, " : "", +#ifndef NO_BUFFERED_WRITES + atomic_read (&lo->flags) + & NBD_BUFFERWR ? "buffer writes, " : "", +#else + "", +#endif /* NO_BUFFERED_WRITES */ + atomic_read (&lo->flags) + & NBD_ENABLED ? "enabled, " : "disabled, ", + atomic_read (&lo->flags) + & NBD_VALIDATED ? "validated, " : "", + atomic_read (&lo->flags) + & NBD_REMOTE_INVALID ? "remote invalid, " : "", + atomic_read (&lo->flags) + & NBD_SHOW_ERRS ? "show_errs, " : "", + atomic_read (&lo->flags) + & NBD_DIRECT ? "direct, " : "", + atomic_read (&lo->flags) + & NBD_SYNC ? "sync, " : "", + atomic_read (&lo->flags) + & NBD_MD5SUM ? "md5sum, " : "", + lo->harderror, + lo->lives - + ((atomic_read (&lo->flags) & NBD_ENABLED) ? 
+ 1 : 0), 0 //atomic_read(&buffermem_pages) + ); + + NBD_PROC_LABEL (4); + + do { // PTB begin long do once block + int countq[2] = { 0, 0 }; + int cmd; + + struct list_head *pos; + + read_lock_irqsave (&lo->queue_lock, flags); + + list_for_each (pos, &lo->queue) { + struct request *req = + list_entry (pos, struct request, queuelist); + if (countq[READ] + countq[WRITE] > 1000) + break; + + cmd = rq_data_dir (req); + countq[cmd]++; + } + + read_unlock_irqrestore (&lo->queue_lock, flags); + + len += sprintf (buf + len, + "[%s] Queued:\t+%dR/%dW curr (check %dR/%dW) +%dR/%dW max\n", + devnam, + atomic_read (&lo->countq[READ]), + atomic_read (&lo->countq[WRITE]), + countq[READ], countq[WRITE], + atomic_read (&lo->maxq[READ]), + atomic_read (&lo->maxq[WRITE])); + } while (0); // PTB end long do once block + + NBD_PROC_LABEL (5); + + len += sprintf (buf + len, + "[%s] Buffersize:\t%d\t(sectors=%d, blocks=%d)\n", + devnam, lo->bufsiz, lo->max_sectors, + lo->max_sectors / (lo->blksize >> 9)); + len += + sprintf (buf + len, "[%s] Blocksize:\t%d\t(log=%d)\n", + devnam, lo->blksize, lo->logblksize); + len += + sprintf (buf + len, "[%s] Size:\t%luKB\n", devnam, + (unsigned long) (lo->bytesize >> 10)); + len += + sprintf (buf + len, "[%s] Blocks:\t%u\n", devnam, + lo->size >> (lo->logblksize - 10)); + + NBD_PROC_LABEL (6); + + len += + sprintf (buf + len, "[%s] Sockets:\t%d", devnam, + lo->nslot); + + NBD_PROC_LABEL (7); + + do { // PTB begin short do once block + int j; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + if (j != atomic_read (&lo->islot)) { + len += + sprintf (buf + len, "\t(%s)", + slotj->file ? "+" : "-"); + } else { + len += + sprintf (buf + len, "\t(%s)", + slotj->file ? "*" : "."); + } + } + } while (0); // PTB end short do once block + + len += sprintf (buf + len, "\n"); + + NBD_PROC_LABEL (8); + + len += sprintf (buf + len, "[%s] Requested:\t%s", devnam, + display (atomic_read + (&lo->requests_in[READ]) + + atomic_read (&lo->requests_in + [WRITE]), 7)); + + NBD_PROC_LABEL (9); + + do { // PTB begin short do once block + int j; + char buff[2][8]; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%s)", + display (slotj->in, 5)); + } + strncpy (buff[0], + display (atomic_read + (&lo->requests_in[READ]), 6), 7); + strncpy (buff[1], + display (atomic_read + (&lo->requests_in[WRITE]), 6), + 7); + len += + sprintf (buf + len, "\t%sR/%sW", buff[0], + buff[1]); + lo->set_speed (lo); + len += sprintf (buf + len, "\tmax %d", + atomic_read (&lo->maxreqblks)); + } while (0); // PTB end short do once block + + len += sprintf (buf + len, "\n"); + len += sprintf (buf + len, "[%s] Despatched:\t%s", devnam, + display (atomic_read + (&lo->requests_out[READ]) + + atomic_read (&lo->requests_out + [WRITE]), 7)); + + NBD_PROC_LABEL (10); + + do { // PTB begin short do once block + int j; + char buff[2][8]; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%s)", + display (slotj->out, 5)); + } + strncpy (buff[0], + display (atomic_read + (&lo->requests_out[READ]), 6), + 7); + strncpy (buff[1], + display (atomic_read + (&lo->requests_out[WRITE]), 6), + 7); + len += + sprintf (buf + len, "\t%sR/%sW", buff[0], + buff[1]); + len += + sprintf (buf + len, "\tmd5 %sW", + display (atomic_read + (&lo->wrequests_5to), 5)); + len += + sprintf (buf + len, " (%s eq,", + display (atomic_read + (&lo->wrequests_5so), 5)); + len += + sprintf (buf + len, " %s ne,", + 
display (atomic_read + (&lo->wrequests_5wo), 5)); + len += + sprintf (buf + len, " %s dn)", + display (atomic_read + (&lo->wrequests_5eo), 5)); + } while (0); // PTB end short do once block + + len += sprintf (buf + len, "\n"); + len += sprintf (buf + len, "[%s] Errored:\t%s", devnam, + display (atomic_read (&lo->requests_err), + 7)); + + NBD_PROC_LABEL (11); + + do { // PTB begin short do once block + int j; + char buff[2][8]; + int toterrs = 0; + + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%s)", + display (slotj->err, 5)); + toterrs += slotj->err; + } + strncpy (buff[0], display (toterrs, 6), 7); + strncpy (buff[1], + display (atomic_read (&lo->requests_err) - + toterrs, 6), 7); + len += + sprintf (buf + len, "\t%s+%s\n", buff[0], + buff[1]); + } while (0); // PTB end short do once block + + NBD_PROC_LABEL (12); + + do { // PTB begin long do once block + int pending_rblks = 0; /* PTB reads not reached the slots yet */ + int pending_wblks = 0; /* PTB writes not reached the slots yet */ + int blks = 0; + + read_lock_irqsave (&lo->queue_lock, flags); + + do { // PTB begin short do once block + struct list_head *pos; + + int count = 0; + struct request *req; + + list_for_each (pos, &lo->queue) { + req = + list_entry (pos, struct request, + queuelist); + if (count++ > 1000) + break; + blks = req->nr_sectors / lo->blksize; + if (blks > 0) { + switch (rq_data_dir (req)) { + case READ: + pending_rblks += + blks; + break; + case WRITE: + pending_wblks += + blks; + break; + } + } + } + } while (0); // PTB end short do once block + + read_unlock_irqrestore (&lo->queue_lock, flags); + len += + sprintf (buf + len, "[%s] Pending:\t%d", devnam, + atomic_read (&lo->requests_req[READ]) + + atomic_read (&lo->requests_req[WRITE])); + + do { // PTB begin short do once block + int j; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = + &lo->slots[j]; + len += + sprintf (buf + len, "\t(%d)", + slotj->req); + } + } while (0); // PTB end short do once block + + len += sprintf (buf + len, + "\t%dR/%dW+%dR/%dW\n", + atomic_read (&lo->requests_req[READ]), + atomic_read (&lo->requests_req[WRITE]), + pending_rblks, pending_wblks); + + } while (0); // PTB end long do once block + + NBD_PROC_LABEL (13); + + do { // PTB begin long do once block + char buff[10][8]; + int shift = lo->logblksize; + + strncpy (buff[0], + display (atomic_read (&lo->wspeed.speed) + << shift, 5), 7); + strncpy (buff[1], + display (atomic_read (&lo->wspeed.speedav) + << shift, 5), 7); + strncpy (buff[2], + display (atomic_read + (&lo->wspeed.speedmax) << shift, + 5), 7); + + strncpy (buff[3], + display (atomic_read (&lo->rspeed.speed) + << shift, 5), 7); + strncpy (buff[4], + display (atomic_read (&lo->rspeed.speedav) + << shift, 5), 7); + strncpy (buff[5], + display (atomic_read + (&lo->rspeed.speedmax) << shift, + 5), 7); + + strncpy (buff[6], + display (atomic_read (&lo->tspeed.speed) + << shift, 5), 7); + strncpy (buff[7], + display (atomic_read (&lo->tspeed.speedav) + << shift, 5), 7); + strncpy (buff[8], + display (atomic_read + (&lo->tspeed.speedmax) << shift, + 5), 7); + + len += + sprintf (buf + len, "[%s] B/s now:", devnam); + len += + sprintf (buf + len, "\t%s\t(%sR+%sW)\n", buff[6], + buff[3], buff[0]); + len += + sprintf (buf + len, "[%s] B/s ave:", devnam); + len += + sprintf (buf + len, "\t%s\t(%sR+%sW)\n", buff[7], + buff[4], buff[1]); + len += + sprintf (buf + len, "[%s] B/s max:", devnam); + len += + sprintf (buf + len, "\t%s\t(%sR+%sW)\n", 
buff[8], + buff[5], buff[2]); + } while (0); // PTB end long do once block + + do { // PTB begin short do once block + int blks; + int tot_reqs = 0; + + len += + sprintf (buf + len, "[%s] Spectrum:", devnam); + for (blks = 0; + blks <= atomic_read (&lo->maxreqblks); blks++) { + tot_reqs += + atomic_read (&lo->req_in[READ][blks]) + + atomic_read (&lo->req_in[WRITE][blks]); + } + + for (blks = 0; + blks <= atomic_read (&lo->maxreqblks); blks++) { + int req_blks = + atomic_read (&lo->req_in[READ][blks]) + + atomic_read (&lo->req_in[WRITE][blks]); + int percent = + tot_reqs > + 0 ? (100 * req_blks) / tot_reqs : 0; + if (percent <= 0) + continue; + len += + sprintf (buf + len, "\t%u%%%d", percent, + blks); + } + len += sprintf (buf + len, "\n"); + } while (0); // PTB end short do once block + + NBD_PROC_LABEL (14); + + len += sprintf (buf + len, "[%s] Kthreads:\t%d", devnam, + atomic_read (&lo->kthreads)); + len += + sprintf (buf + len, "\t(%d waiting/%d running/%d max)\n", + atomic_read (&lo->kwaiters), + atomic_read (&lo->kthreads) - + atomic_read (&lo->kwaiters), + atomic_read (&lo->kmax)); + + NBD_PROC_LABEL (15); + + len += sprintf (buf + len, "[%s] Cthreads:\t%d", devnam, + atomic_read (&lo->cthreads)); + + NBD_PROC_LABEL (16); + + do { + int j; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + int state = + ((slotj->flags & NBD_SLOT_RUNNING) ? 1 : + 0) + + ((slotj->flags & NBD_SLOT_WAITING) ? 2 : + 0); + char *desc = "?"; + switch (state) { + case 0: + desc = "-"; + break; /* PTB not in */ + case 1: + desc = "*"; + break; /* PTB in and not waiting */ + case 2: + desc = "?"; + break; /* PTB impossible */ + case 3: + desc = "+"; + break; /* PTB in and waiting */ + } + len += sprintf (buf + len, "\t(%s)", desc); + } + } while (0); + + len += sprintf (buf + len, "\n"); + + NBD_PROC_LABEL (17); + + last = i; + len += sprintf (buf + len, "[%s] Cpids:\t%d", devnam, + atomic_read (&lo->cthreads)); + + do { + int j; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%u)", slotj->pid); + } + len += sprintf (buf + len, "\n"); + } while (0); + + do { + int j, k; + for (j = 0; j < lo->nslot; j++) { + struct nbd_slot *slotj = &lo->slots[j]; + if (slotj->spid != 0) + break; + } + if (j < lo->nslot) { + len += + sprintf (buf + len, "[%s] Kpids:\t%d", + devnam, + atomic_read (&lo->cthreads)); + for (k = 0; k < lo->nslot; k++) { + struct nbd_slot *slotk = + &lo->slots[k]; + len += + sprintf (buf + len, "\t(%u)", + slotk->spid); + } + len += sprintf (buf + len, "\n"); + } + } while (0); + + NBD_PROC_LABEL (18); + + NBD_PROC_LABEL (19); + + // PTB have to tell loop head that we are not reentering + next_label = NULL; + next_label_name = NULL; + } + + NBD_PROC_LABEL (20); + + if (last == i - 2) { + struct nbd_device * lo = nbd_get (i - 1); + char *prevnam = lo->devnam; + len += + sprintf (buf + len, "Device %s:\tClosed\n", prevnam); + } + + if (last < i - 2) { + char lastnam[3]; + char prevnam[3]; + struct nbd_device * llo = nbd_get (last + 1); + struct nbd_device * plo = nbd_get (i - 1); + strncpy (lastnam, llo->devnam, 3); + strncpy (prevnam, plo->devnam, 3); + len += sprintf (buf + len, "Device %s-%s:\tClosed\n", + lastnam, prevnam); + } + + NBD_PROC_LABEL (21); + + // PTB re-init vital statistics for next time + next_label = NULL; + next_label_name = NULL; + + *eof = 1; + *start = buf; + total += len; + + return len; +} + +/* + * PTB read an int from a string. Return number of ints read (0 or 1). 
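+ * e.g. sscani (" -42", 4, &n) stores -42 in n and returns 1, while
+ * sscani ("abc", 3, &n) returns 0 and leaves n untouched.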
+ */
+static int
+sscani (char *buf, int len, int *n)
+{
+
+	int i, a = 0;
+	short has_digits = 0;
+	short is_signed = 0;
+
+	// PTB look for first significant character
+	for (i = 0; i < len; i++) {
+		char c = buf[i];
+		if (c == ' ' || c == '\t') {
+			if (is_signed)
+				return 0;
+		} else if (c == '-') {
+			if (is_signed)
+				return 0;
+			is_signed = -1;
+		} else if (c == '+') {
+			if (is_signed)
+				return 0;
+			is_signed = 1;
+		} else if (c >= '0' && c <= '9') {
+			// PTB don't clobber a sign we already saw
+			if (!is_signed)
+				is_signed = 1;
+			has_digits = 1;
+			break;
+		} else {
+			return 0;
+		}
+	}
+	// PTB i now points at first digit if there is one
+	if (!has_digits)
+		return 0;
+	for (; i < len; i++) {
+		char c = buf[i];
+		if (c < '0' || c > '9')
+			break;
+		a *= 10;
+		a += c - '0';
+	}
+	if (is_signed >= 0) {
+		*n = a;
+	} else {
+		*n = -a;
+	}
+	return 1;
+}
+
+/*
+ * look for a 1 or 2 letter device code ("a" or "aa") and save the
+ * device number to which it refers. Return number of device letter
+ * codes found (0 or 1).
+ */
+static int
+sscana (char *buf, int len, int *n)
+{
+
+	int i, a = 0;
+	short has_letters = 0;
+
+	for (i = 0; i < len; i++) {
+		char c = buf[i];
+		if (c >= 'a' && c <= 'z') {
+			has_letters = 1;
+			break;
+		} else if (c == ' ') {
+			if (has_letters)
+				return 0;
+		} else {
+			return 0;
+		}
+	}
+	if (!has_letters)
+		return 0;
+	for (; i < len; i++) {
+		char c = buf[i];
+		if (c < 'a' || c > 'z')
+			break;
+		a *= 26;
+		a += c - 'a';
+	}
+	*n = a;
+	return 1;
+}
+
+/*
+ * read an integer (or 2-letter ascii) arg into an int. Return number
+ * of integers read (0 or 1) and -1 for no keymatch. The first arg is a
+ * preceding key.
+ * @i is the integer value that results
+ * @j is an index if one is supplied (foo[j] = i ), else -1
+ */
+static int
+getarg (const char *buffer, int buflen, const char *key, int *i, int *j)
+{
+
+	int keylen;
+
+	void skip_ws (void) {
+		while (buflen > 0) {
+			if (*buffer != ' ' && *buffer != '\t')
+				break;
+			buffer++;
+			buflen--;
+		}
+	};
+
+	skip_ws ();
+
+	keylen = strlen (key);
+	if (strncmp (buffer, key, keylen))
+		return -1;
+
+	buffer += keylen;
+	buflen -= keylen;
+
+	skip_ws ();
+
+	*j = -1;
+	if (*buffer == '[') {
+		char *closing;
+		int indexlen;
+
+		buffer++;
+		buflen--;
+
+		skip_ws ();
+
+		closing = strchr (buffer, ']');
+		if (!closing)
+			return -1;
+		indexlen = closing - buffer;
+		*closing = 0;
+
+		// PTB the index is either numeric or a drive letter code
+		if (sscani ((char *) buffer, indexlen, j) < 1
+		    && sscana ((char *) buffer, indexlen, j) < 1)
+			return 0;
+
+		buffer = closing;
+		buflen -= indexlen;
+
+		buffer++;
+		buflen--;
+
+		skip_ws ();
+	}
+
+	if (*buffer != '=')
+		return -1;
+
+	buffer++;
+	buflen--;
+
+	skip_ws ();
+
+	// PTB likewise the value: numeric, or a letter code
+	if (sscani ((char *) buffer, buflen, i) < 1
+	    && sscana ((char *) buffer, buflen, i) < 1)
+		return 0;
+	return 1;
+}
+
+/*
+ * PTB - write 1 (e.g. "echo -n 1 > /proc/nbdinfo") for a hard reset of
+ * all devices; write 0 for a soft reset with delayed re-enable.
+ */
+static int
+nbd_write_proc (struct file *file, const char *buffer, unsigned long count,
+		void *data)
+{
+
+	switch (count) {
+
+		int i;
+
+	case 2:
+		if (buffer[1] != '\n')
+			break;
+		/* else fallthru to case 1 */
+	case 1:
+		switch (*buffer) {
+		case '1':
+			for (i = 0; i < MAX_NBD; i++) {
+				struct nbd_device *lo = nbd_get(i);
+				lo->hard_reset (lo);
+			}
+			break;
+		case '0':
+			for (i = 0; i < MAX_NBD; i++) {
+				// PTB this takes the io spinlock and our spinlock.
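+				// PTB soft reset, then queue a re-enable a few seconds later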
+ struct nbd_device *lo = nbd_get(i); + lo->soft_reset (lo); + lo->reenable_delay(lo, 5); + } + break; + } + break; + default: + do { + int index; + int merge_requests; + int sync_intvl; + int show_errs; + int md5sum; +#ifndef NO_BUFFERED_WRITES + int buffer_writes; +#endif + int enable; + int direct; + int reset; + + if (getarg (buffer, count, "merge_requests", + &merge_requests, &index) >= 0) { + // merge_requests + set_merge_requests (merge_requests, index); + break; + } + if (getarg (buffer, count, "sync_intvl", + &sync_intvl, &index) >= 0 + || getarg (buffer, count, "sync", + &sync_intvl, &index) >= 0) { + // sync_intvl + set_sync_intvl (sync_intvl, index); + break; + } + if (getarg (buffer, count, "show_errs", + &show_errs, &index) >= 0) { + // show_errs + set_show_errs (show_errs, index); + break; + } + if (getarg (buffer, count, "md5sum", + &md5sum, &index) >= 0) { + // md5sum + set_md5sum (md5sum, index); + break; + } +#ifndef NO_BUFFERED_WRITES + if (getarg (buffer, count, "buffer_writes", + &buffer_writes, &index) >= 0) { + // buffer_writes + set_buffer_writes (buffer_writes, index); + break; + } +#endif /* NO_BUFFERED_WRITES */ + if (getarg (buffer, count, "enable", + &enable, &index) >= 0) { + // enable + set_enable (enable, index); + break; + } + if (getarg (buffer, count, "direct", + &direct, &index) >= 0) { + // enable + set_direct(direct, index); + break; + } + if (getarg (buffer, count, "reset", + &reset, &index) >= 0) { + // reset + do_reset(reset, index); + break; + } + NBD_ERROR ("illegal %ld character command\n", + count); + return -EINVAL; + } while (0); + break; + } + return count; +} + +void +nbd_init_proc(struct proc_dir_entry *res) { + res->read_proc = nbd_read_proc; + res->write_proc = nbd_write_proc; +} + --- linux-2.5.64/drivers/Makefile.pre-enbd Wed Mar 5 04:29:33 2003 +++ linux-2.5.64/drivers/Makefile Mon Mar 24 17:44:04 2003 @@ -30,6 +30,7 @@ obj-$(CONFIG_SBUS) += sbus/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ +obj-$(CONFIG_ENBD) += block/enbd/ obj-$(CONFIG_PARIDE) += block/paride/ obj-$(CONFIG_TC) += tc/ obj-$(CONFIG_USB) += usb/ --- linux-2.5.64/include/linux/enbd.h.pre-enbd Tue Mar 25 15:20:38 2003 +++ linux-2.5.64/include/linux/enbd.h Tue Mar 25 14:17:37 2003 @@ -0,0 +1,517 @@ +#ifndef LINUX_ENBD_H +#define LINUX_ENBD_H + +/* unsigned comments are Pavel's originals for 2.1.* + * pavel@atrey.karlin.mff.cuni.cz (Pavel Machek) + * comments marked PTB are from + * ptb@it.uc3m.es (Peter T. Breuer) + * comments marked AMARIN are from + * amarin@it.uc3m.es (Andres Marin Lopez) + */ + +#include + +#ifndef NBD_VERSION +#define NBD_VERSION "2.4.30 $Date$" +#endif /*NBD_VERSION*/ + + /* + * Third type of request apart from READ or WRITE + */ + #ifndef IOCTL + # define IOCTL 2 + #endif + /* + * and fourth .. + */ + #ifndef MD5SUM + # define MD5SUM 3 + #endif + /* + * and fifth .. + */ + #ifndef SPECIAL + # define SPECIAL 4 + #endif + + /* + * We need extra bits of req->flags + * */ + # define __REQ_NBD __REQ_NR_BITS + # define REQ_NBD (1 << __REQ_NBD) + # define __REQ_NBDSEQNO (__REQ_NR_BITS + 1) + # define REQ_NBDSEQNO (1 << __REQ_NBDSEQNO) + // PTB ... and all the other bits are seqno too! 
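
That last remark is compressed: every bit of req->flags from __REQ_NBDSEQNO upwards is treated as a single integer field holding a rolling sequence number, and enbd_seqno.c sizes its generation counter from the same boundary (_NBD_GENERATION). The patch itself only forward-declares the accessor helpers in enbd.h below; the following is a reconstruction of what they plausibly do, inferred from these defines rather than quoted from the patch (rq_seqno is an assumed name):

	/* sketch only: keep the genuine flag bits, park the seqno above them */
	static inline void rq_set_seqno (struct request *req, int seqno)
	{
		req->flags &= REQ_NBDSEQNO - 1;		/* mask of all real flags */
		req->flags |= seqno << __REQ_NBDSEQNO;	/* seqno lives above them */
	}

	static inline int rq_seqno (struct request *req)
	{
		return req->flags >> __REQ_NBDSEQNO;
	}

This is why nbd_end_request below has to zero the seqno field before handing the request back: the block layer would otherwise see stray high bits in flags.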
+
+/* PTB - new style ioctl assignments */
+ #define NBD_SET_SOCK		_IOW(0xab, 0x00, int)
+ #define NBD_TEST_IOCTL1	_IOW(0xab, 0x01, int)
+ #define NBD_SET_SIZE		_IOW(0xab, 0x02, int)
+ #define NBD_DO_IT		_IOW(0xab, 0x03, int)
+ #define NBD_CLEAR_SOCK		_IOW(0xab, 0x04, int)
+ #define NBD_CLEAR_QUE		_IO (0xab, 0x05)
+ #define NBD_PRINT_DEBUG	_IO (0xab, 0x06)
+ #define NBD_TEST_IOCTL2	_IOR(0xab, 0x07, int)
+ #define NBD_HARD_RESET		_IO (0xab, 0x09)
+ #define NBD_DEC_USE_COUNT	_IO (0xab, 0x09)
+ #define MY_NBD_ACK		_IOW(0xab, 0x0a, char *)
+ #define MY_NBD_GET_REQ		_IOW(0xab, 0x0b, char *)
+ #define MY_NBD_REG_BUF		_IOW(0xab, 0x0c, char *)
+ #define MY_NBD_CLR_REQ		_IOW(0xab, 0x0d, int)
+ #define MY_NBD_SYNC		_IOW(0xab, 0x0e, int)
+ #define NBD_SET_SECTORS	_IOW(0xab, 0x0f, int)
+ #define MY_NBD_SET_SIG		_IOW(0xab, 0x10, int *)
+ #define NBD_RESET		_IO (0xab, 0x11)
+ #define NBD_TEST_IOCTL3	_IOWR(0xab, 0x12, int)
+ #define MY_NBD_ERR_REQ		_IOW(0xab, 0x13, int)
+ #define MY_NBD_SET_INTVL	_IOW(0xab, 0x14, int)
+ #define MY_NBD_SET_SHOW_ERRS	_IOW(0xab, 0x15, int)
+ #define NBD_SET_MD5SUM		_IOW(0xab, 0x16, int)
+ #define MY_NBD_SET_BUFFERWR	_IOW(0xab, 0x17, int)
+ #define MY_NBD_INVALIDATE	_IOW(0xab, 0x18, int)
+ #define MY_NBD_SET_SPID	_IOW(0xab, 0x19, int)
+ #define MY_NBD_SET_RQ_HANDLE	_IOW(0xab, 0x1a, void*)
+ #define MY_NBD_SET_RQ_SEQNO	_IOW(0xab, 0x1b, int)
+ #define MY_NBD_SET_RQ_DIGEST	_IOWR(0xab, 0x1d, nbd_digest_t)
+ #define NBD_TEST_IOCTL4	_IOR(0xab, 0x1e, char[256])
+ #define NBD_TEST_IOCTL5	_IOWR(0xab, 0x1f, char[256])
+ #define NBD_TEST_IOCTL6	_IO(0xab, 0x20)	// special r 256B
+ #define NBD_TEST_IOCTL7	_IO(0xab, 0x21)	// special rw 256B
+ #define NBD_SET_BLKSIZE	_IOW(0xab, 0x22, int)
+ #define NBD_GET_BLKSIZE	_IOR(0xab, 0x23, long)
+ #define NBD_SET_PF_MEMALLOC	_IOW(0xab, 0x24, int)
+ #define MY_NBD_SET_DIRECT	_IOW(0xab, 0x25, int)
+ #define MY_NBD_GET_NPORT	_IOR(0xab, 0x26, int)
+
+#define MAX_NBD 16	/* PTB MAX was 128, but that's a lot */
+#define NBD_SHIFT 4	/* PTB 16 partitions/sockets/slots per device */
+			/* PTB number of socket slots per device */
+#define NBD_MAXCONN (1<<NBD_SHIFT)
+
+ #define ENDREQ_NOCURRENT
+ #define LOCAL_END_REQUEST
+ #include
+
+
+ /* PTB various defaults */
+ #define NBD_RAHEAD_DFLT 24	/* PTB slow medium */
+ #define NBD_SYNC_INTVL 0	/* PTB sync every nK reqs (default disable) */
+ #define NBD_REQ_TIMEO 5	/* PTB client inactivity chk intvl (rollback) */
+ #define NBD_SPEED_LIM 100000	/* PTB limit to 100M write reqs/s */
+ #define NBD_MERGE_REQ_DFLT 0	/* PTB until accounting fixed! */
+ /* PTB Jens Axboe says that plug should always be set in 2.4.* */
+ #define NBD_PLUG_DFLT 1
+ #define NBD_MD5SUM_DFLT 0
+
+/*
+ * PTB User messaging defs.
+ */
+
+ #define NBD_ID "NBD #%d[%d]: %s "
+
+ #define NBD_DEBUG(level, s...) \
+	{ static int icnt; printk( KERN_DEBUG NBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s);}
+ #define NBD_ERROR( s...) \
+	{ static int icnt; printk( KERN_ERR NBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s);}
+ #define NBD_ALERT( s...) \
+	{ static int icnt; printk( KERN_ALERT NBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s);}
+ #define NBD_INFO( s...) \
+	{ static int icnt; printk( KERN_INFO NBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s);}
+
+
+
+ struct nbd_slot {
+	struct file * file;	/* PTB add - for refcnt, NULL if slot empty */
+	struct socket * sock;	/* PTB add */
+	int in;			/* PTB add - tot blocks entered */
+	int out;		/* PTB add - tot blocks released */
+	int err;		/* PTB add - tot blocks errored */
+	int req;		/* PTB add - tot blocks pending */
+	char * buffer;		/* PTB add - user space buffer */
+	int bufsiz;		/* PTB add - user space buffer size */
+	struct list_head queue;
+	unsigned long req_age;	/* PTB add - age of pending req */
+	unsigned long cli_age;	/* PTB add - age of client */
+	struct nbd_device *lo;	/* PTB add - parent device */
+	#define NBD_SLOT_RUNNING	0x0001
+	#define NBD_SLOT_WAITING	0x0002
+	#define NBD_SLOT_BUFFERED	0x0004
+	#define NBD_SLOT_MD5SUM		0x8000	/* slot reply has a digest in it ..*/
+	#define NBD_SLOT_MD5_OK		0x10000	/* .. and equaled req's */
+	int flags;		/* PTB add */
+	int i;			/* PTB add - slot number */
+	int buflen;		/* PTB add - buffer byte count */
+	int pid;		/* PTB add - client process */
+	int refcnt;		/* PTB add - so can set_sock/clr_sock ourself */
+	int nerrs;		/* PTB add - local error count */
+	int spid;		/* PTB add - server pid */
+ };
+
+ struct nbd_md;
+ struct nbd_md {
+	int count;
+	struct semaphore access_sem;
+	int (*notify_fn)(kdev_t, int);
+	int (*notify)(struct nbd_md *,kdev_t);
+	int (*unnotify)(struct nbd_md *,kdev_t);
+	void (*dec)(struct nbd_md *);
+	void (*inc)(struct nbd_md *);
+	void (*reg)(struct nbd_md *, int(*)(kdev_t, int));
+ };
+
+ struct nbd_speed {
+	atomic_t speed;		/* PTB add - current speed in KB/s */
+	atomic_t speedmax;	/* PTB add - max speed */
+	atomic_t speedav;	/* PTB add - average speed */
+	atomic_t distance;	/* PTB add - last distance measure */
+	atomic_t jiffy;		/* PTB add - last jiffies speed set */
+	atomic_t frstj;		/* PTB add - first jiffies */
+	void (*update)(struct nbd_speed*, int);
+ };
+
+ struct nbd_md_list {
+	struct list_head list;
+	kdev_t dev;
+ };
+
+ struct nbd_seqno;	// forward decl
+ struct nbd_seqno {
+	unsigned int seqno;	/* PTB add - sequence number */
+	atomic_t seqno_gen;	/* PTB add - seqno generation */
+	void (*inc)(struct nbd_seqno *);
+	int (*get)(struct nbd_seqno *);
+	void (*reset)(struct nbd_seqno *);
+	unsigned (*calc)(struct nbd_seqno *, unsigned);
+ };
+
+ struct nbd_device {
+	atomic_t refcnt;
+
+	#define NBD_READ_ONLY		0x0001
+	#define NBD_WRITE_NOCHK		0x0002
+	#define NBD_INITIALISED		0x0004
+	#define NBD_SIGNED		0x0008
+
+	#define NBD_ENABLED		0x0010
+	#define NBD_SIZED		0x0020
+	#define NBD_BLKSIZED		0x0040
+
+	#define NBD_QBLOCKED		0x0100
+	#define NBD_SHOW_ERRS		0x0200
+	#define NBD_SYNC		0x0400
+	#define NBD_VALIDATED		0x0800	/* read partition table */
+
+	#define NBD_BUFFERWR		0x1000	/* buffer writes to device */
+	#define NBD_REMOTE_INVALID \
+				0x2000	/* remote resource vanished */
+	#define NBD_DIRECT		0x4000	/* convert opens to O_DIRECT */
+	#define NBD_MD5SUM		0x8000
+
+
+	atomic_t flags;
+	int harderror;		/* Code of hard error */
+	int magic;		/* FIXME: not if debugging is off */
+	struct list_head queue;
+	rwlock_t queue_lock;	/* PTB add - spinlock */
+	int nslot;		/* PTB add - total slots */
+	atomic_t islot;		/* PTB add - current slot */
+	int aslot;		/* PTB add - total active slots*/
+	atomic_t requests_in[2];	/* PTB add - blocks put on queue */
+	atomic_t requests_out[2];	/* PTB add - blocks out from queue */
+	atomic_t requests_err;	/* PTB add - blocks erred on queue */
+	atomic_t wrequests_5so;	/* PTB add - write blocks md5
skip */ + atomic_t wrequests_5wo; /* PTB add - write blocks md5 wr */ + atomic_t wrequests_5eo; /* PTB add - write blocks md5 refus*/ + atomic_t wrequests_5to; /* PTB add - write blocks md5sum */ + atomic_t wrequests_5co; /* PTB add - write blocks md5 tot */ + atomic_t wrequests_5no; /* PTB add - write blocks not md5 */ + atomic_t requests_req[2]; /* PTB add - read blocks pending */ + atomic_t kwaiters; /* PTB add - kernel thrds waiting */ + atomic_t kthreads; /* PTB add - kernel threads in */ + atomic_t maxq[2]; /* PTB add - max req queue depth */ + atomic_t countq[2]; /* PTB add - request queue depth */ + atomic_t errors; /* PTB add - tot requests errored */ + struct nbd_seqno seqno_out; /* PTB add - seq number */ + atomic_t cwaiters; /* PTB add - client thrds waiting */ + atomic_t cthreads; /* PTB add - client threads in */ + atomic_t req_in[2][1 + NBD_MAX_SECTORS/2]; + wait_queue_head_t wq; /* PTB add */ + struct nbd_slot slots[NBD_MAXCONN]; /* PTB add - client array */ + unsigned blksize; /* PTB add - device blksize in B */ + u64 bytesize; /* PTB add - device size in B */ + u64 sectors; /* PTB add - device size (sectors) */ + unsigned size; /* PTB add - device size in blks */ + unsigned logblksize; /* PTB add - log2 blksize */ + unsigned nbd; /* PTB add - this array index */ + int signature[NBD_SIGLEN/sizeof(int)]; + /* PTB add - server sig */ + struct file * file; /* PTB add - for ref */ + struct inode * inode; /* PTB add - for ref */ + int bufsiz; /* PTB add - userspace buffer size */ + atomic_t kmax; /* PTB add - max kernel threads */ + char *blockmap; /* PTB add - map of block states */ + unsigned long disabled; /* PTB add - when was it disabled */ + int req_timeo; /* PTB add - inactivity timeout */ + struct timer_list run_queue; /* PTB add - run queue */ + struct work_struct task_queue; /* PTB add - task queue */ + char devnam[4]; /* PTB add - drive letters */ + atomic_t maxreqblks; /* PTB add - maximum req size seen */ + int max_sectors; /* PTB add - max req size allowed! */ + int lives; /* PTB add - # times enabled */ + // PTB speed measurement settings + struct nbd_speed tspeed; + struct nbd_speed wspeed; + struct nbd_speed rspeed; + int dummy; /* PTB add - unused */ + struct request *req; /* PTB fake request for ioctls */ + wait_queue_head_t req_wq; /* PTB req done notifications */ + struct request *rq; /* PTB special request ptr */ + struct list_head altqueue; /* PTB diverted requests */ + rwlock_t altqueue_lock; /* PTB add - diverted reqs lock */ + atomic_t seqno_in; /* PTB add - unacked reqs */ + struct semaphore pid_sem; /* PTB control setting pid */ + struct gendisk *disk; /* PTB for partitions */ + struct request_queue *q; /* PTB make queue internal */ + rwlock_t meta_lock; /* PTB add - spinlock meta data */ + atomic_t merge_requests; /* PTB local req blks limit - 1 */ + unsigned long reenable_time; /* PTB time to delayed reenable */ + void (*enable) (struct nbd_device *lo); + void (*reset) (struct nbd_device *lo, int i); + int (*disable) (struct nbd_device *lo); + int (*read_only) (struct nbd_device *lo); + void (*set_speed) (struct nbd_device *lo); + int (*hard_reset)(struct nbd_device *lo); + int (*soft_reset)(struct nbd_device *lo); + int (*reenable_delay) (struct nbd_device *lo, int delay); + }; + +#endif /* MAJOR_NR */ + + + +/* Pavel - This now IS in some kind of include file... 
*/ + +/* PTB 132 */ +#define NBD_INIT_MAGIC 0x12345678 /* AMARIN */ +#define NBD_REQUEST_MAGIC 0x25609513 +#define NBD_REPLY_MAGIC 0x67446698 +/* Pavel - Do *not* use magics: 0x12560953 0x96744668. + */ + +#define NBD_DEV_MAGIC 0x68797548 + +#define NBD_REQUEST_MAGIC_T __u32 +#define NBD_REQUEST_TYPE_T __u32 +#define NBD_REQUEST_FROM_T __u64 +#define NBD_REQUEST_LEN_T __u32 +#define NBD_REQUEST_FLAGS_T __u32 +#define NBD_REQUEST_TIME_T __u64 +#define NBD_REQUEST_ZONE_T __u64 +#define NBD_REQUEST_SPECIAL_T __u32 + +#define NBD_REPLY_MAGIC_T __u32 +#define NBD_REPLY_ERROR_T __s32 +#define NBD_REPLY_FLAGS_T __u32 +#define NBD_REPLY_TIME_T __u64 +#define NBD_REPLY_ZONE_T __u64 + +#define NBD_REQUEST_HANDLE_T __u32 +#define NBD_REPLY_HANDLE_T __u32 + + typedef __u32 nbd_digest_t[4]; + + #define NBD_DIGEST_T nbd_digest_t + +#define NBD_REQUEST_DIGEST_T nbd_digest_t +#define NBD_REPLY_DIGEST_T nbd_digest_t + + +#define NBD_DIGEST_BITS 128 +#define NBD_DIGEST_LENGTH ((NBD_DIGEST_BITS)/8) +#define NBD_REQUEST_SEQNO_T __u32 + +struct nbd_request { + NBD_REQUEST_MAGIC_T magic; + NBD_REQUEST_TYPE_T type; /* == READ || == WRITE */ + NBD_REQUEST_HANDLE_T handle; + NBD_REQUEST_FROM_T from; /* 64 bit PTB 132 */ + NBD_REQUEST_LEN_T len; + + + +#define NBD_REQUEST_ERRORED 0x0800 +#define NBD_REQUEST_MD5SUM 0x8000 /* has a digest in it ..*/ +#define NBD_REQUEST_MD5_OK 0x10000 /* .. and equaled req's */ +#define NBD_REQUEST_IOCTL 0x40000 /* ioctl in len, arg in from */ +#define NBD_REQUEST_SPECIALRW 0x80000 /* 1 for w 0 for r on special */ + NBD_REQUEST_FLAGS_T flags; + NBD_REQUEST_TIME_T time; + NBD_REQUEST_ZONE_T zone; + NBD_REQUEST_SEQNO_T seqno; + union { + NBD_REQUEST_DIGEST_T digest; + } data; + NBD_REQUEST_SPECIAL_T special; + char dummy0[0]; + char dummy1[0] __attribute__ ((aligned (64))); +} __attribute__ ((packed)) ; + + #define NBD_REQUEST_LENGTH sizeof(struct nbd_request) + +struct nbd_reply { + NBD_REPLY_MAGIC_T magic; + NBD_REPLY_ERROR_T error; /* 0 = ok, else error */ + NBD_REPLY_HANDLE_T handle; /* handle you got from request */ + + + +#define NBD_REPLY_ERRORED 0x0800 +#define NBD_REPLY_MD5SUM 0x8000 /* has a digest in it .. */ +#define NBD_REPLY_MD5_OK 0x10000 /* .. 
and equaled req's */ +#define NBD_REPLY_CLOSE 0x20000 /* close cmd from server */ +#define NBD_REPLY_IOCTL 0x40000 /* ioctl in len, arg in from */ + NBD_REPLY_FLAGS_T flags; + NBD_REPLY_TIME_T time; + NBD_REPLY_ZONE_T zone; + union { + NBD_REPLY_DIGEST_T digest; + } data; + char dummy0[0]; + char dummy1[0] __attribute__ ((aligned (64))); +} __attribute__ ((packed)) ; + + #define NBD_REPLY_LENGTH sizeof(struct nbd_reply) + + #define NBD_BUFFER_DATA_OFFSET \ + ((NBD_REQUEST_LENGTH>NBD_REPLY_LENGTH)?NBD_REQUEST_LENGTH:NBD_REPLY_LENGTH) + + #ifdef MAJOR_NR + + // PTB forward declaration + static struct nbd_device nbd_dev[]; + + + static long wait_for_completion_timeout(struct completion *x, long timeout) + { + spin_lock_irq(&x->wait.lock); + if (!x->done && timeout > 0) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout > 0); + __remove_wait_queue(&x->wait, &wait); + } + if (x->done) { + x->done--; + if (timeout <= 0) + timeout = 1; + } + spin_unlock_irq(&x->wait.lock); + return timeout; + } + + static void end_request(struct request *req, int uptodate) { + + struct bio *bio; + struct nbd_device *lo = req->rq_disk->private_data; + static int rq_type(struct request *); + + if (rq_type(req) == IOCTL) { + // PTB this is the devices ioctl request + complete(req->waiting); + // PTB let the driver code return the req, etc. + return; + } + + /* unlock chained buffers */ + while ((bio = req->bio) != NULL) { + unsigned nsect = bio_sectors(bio); + blk_finished_io(nsect); + req->bio = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio, nsect << 9, uptodate ? 0 : -EIO); + } + + if (req->flags & REQ_SPECIAL) + // don't account specials + return; + + write_lock(&lo->altqueue_lock); + if (atomic_read(&lo->countq[READ]) + + atomic_read(&lo->countq[WRITE]) == 0) { + if (atomic_read(&lo->flags) & NBD_QBLOCKED) { + static int nbd_requeue(struct nbd_device *); + nbd_requeue(lo); + atomic_clear_mask(NBD_QBLOCKED, &lo->flags); + } + } + write_unlock(&lo->altqueue_lock); + } + + /* + * PTB This takes the spinlock itself! So call it with the io spinlock + * not held. + */ + static void end_request_lock(struct request *req, int uptodate) { + + unsigned long flags; + request_queue_t *q = req->q; + + spin_lock_irqsave(q->queue_lock, flags); + end_request(req, uptodate); + spin_unlock_irqrestore(q->queue_lock, flags); + } + + /* + * PTB Call this only with the io spinlock * held. + */ + static inline void nbd_end_request(struct request *req) { + + // PTB the kernel has only 2 queues, read and write, and it uses + // the cmd field to determine to which the req belongs. We add a + // seqno to it in nbd_do_req, so we reestablish it here. + static void rq_set_seqno(struct request *, int); + + rq_set_seqno(req, 0); // PTB Zero extra seqno info + end_request( req, (req->errors == 0) ? 1 : 0 ); + } + + /* + * PTB This takes the spinlock itself! So call it with the io spinlock + * not held. + */ + static void nbd_end_request_lock(struct request *req) { + + // PTB the kernel has only 2 queues, read and write, and it uses + // the cmd field to determine to which the req belongs. We add a + // seqno to it in nbd_do_req, so we reestablish it here. 
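+	// PTB block-scope declaration: the driver file that defines
+	// MAJOR_NR and includes this header supplies rq_set_seqno itself.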
+	static void rq_set_seqno(struct request *, int);
+
+	rq_set_seqno(req, 0);	// PTB Zero extra seqno info
+	end_request_lock( req, !req->errors );
+  }
+
+  extern void nbd_init_seqno(struct nbd_seqno *);
+  extern void nbd_init_speed(struct nbd_speed *);
+  extern void nbd_init_md(struct nbd_md *);
+  extern void nbd_init_proc(struct proc_dir_entry *res);
+
+  #endif /* MAJOR_NR */
+
+#endif /* LINUX_ENBD_H */
+
--- linux-2.5.64/include/linux/enbd_ioctl.h.pre-enbd	Tue Mar 25 15:20:45 2003
+++ linux-2.5.64/include/linux/enbd_ioctl.h	Mon Mar 24 22:52:26 2003
@@ -0,0 +1,56 @@
+#ifndef NBD_IOCTL_H
+#define NBD_IOCTL_H 1
+
+int nbd_ioctl_convert(int ioctl);
+int nbd_ioctl_convert_inplace(int *ioctl);
+int nbd_ioctl_revert(int ioctl);
+int nbd_ioctl_size (int cmd, char *arg);
+int nbd_ioctl_size_user (int cmd, char *arg);
+#ifdef __KERNEL__
+int nbd_ioctl_copy_to_user (int cmd, char *arg, char *buf, int size);
+int nbd_ioctl_copy_from_user (int cmd, char *buf, char *arg, int size);
+
+/*
+ * PTB object containing all the above methods, to be registered with
+ * the enbd.o module
+ */
+struct nbd_ioctl {
+#define NBD_REMOTE_IOCTL_ENABLED 0x01
+	unsigned long flags;
+	int (*convert) (int ioctl);
+	int (*convert_inplace)(int *ioctl);
+	int (*revert) (int ioctl);
+	int (*size) (int cmd, char *arg);
+	int (*size_user) (int cmd, char *arg);
+	int (*cp_to_user) (int cmd, char *arg, char *buf, int size);
+	int (*cp_from_user) (int cmd, char *buf, char *arg, int size);
+};
+
+struct nbd_ioctl_stub {
+	struct nbd_ioctl * remote;
+	int (*reg) (struct nbd_ioctl_stub *,struct nbd_ioctl *);
+	int (*unreg) (struct nbd_ioctl_stub *,struct nbd_ioctl *);
+};
+
+extern struct nbd_ioctl_stub nbd_remote_ioctl;
+extern int nbd_init_ioctl_stub(struct nbd_ioctl_stub *);
+#endif
+
+// PTB conversion table entries
+struct ioctl_conv {
+	unsigned int old;	// ioctl id, _IO or _IOR or _IOW or _IOWR
+	unsigned int new;	// ioctl id
+};
+
+// PTB extended conversion table entries
+struct ioctl_special {
+	int new;
+	int (*size) (char *arg);
+	int (*size_user) (char *arg);
+	int (*ioctl_copy_from_user)(char *buf, char*arg, int size);
+	int (*ioctl_copy_to_user)(char *arg, char*buf, int size);
+};
+
+#endif /* NBD_IOCTL_H */
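
To make the conversion-table idea concrete: suppose some driver declared an ioctl bare with _IO although its argument is really a pointer to an int that userspace hands to the kernel. A repair entry in the style of ioctl_conv_tab would look as follows, using the _NEW_IOW helper defined earlier; FOO_SETSPEED and foo_conv_tab are invented for illustration and appear nowhere in this patch:

	#define FOO_SETSPEED	_IO(0xfe, 0x01)	/* as (mis)declared: no size, no direction */

	static struct ioctl_conv foo_conv_tab[] = {
		/* wrong at left, corrected at right: same class and number,
		 * but the new id also encodes _IOC_WRITE and sizeof(int) */
		{ FOO_SETSPEED, _NEW_IOW(FOO_SETSPEED, int), },
		{ 0, 0, },	/* PTB table must terminate with a 0,0 entry */
	};

Once nbd_ioctl_convert() has mapped the old id to the corrected one, nbd_ioctl_size() can recover sizeof(int) from the id alone, which is exactly the information the remote end needs in order to move the right number of bytes across the wire.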