1 diff --git a/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt
2 new file mode 100644
3 index 0000000..afba75a
4 --- /dev/null
5 +++ b/Documentation/power/tuxonice-internals.txt
6 @@ -0,0 +1,469 @@
7 +                  TuxOnIce 3.0 Internal Documentation.
8 +                       Updated to 11 March 2008
9 +
10 +1.  Introduction.
11 +
12 +    TuxOnIce 3.0 is an addition to the Linux Kernel, designed to
13 +    allow the user to quickly shutdown and quickly boot a computer, without
14 +    needing to close documents or programs. It is equivalent to the
15 +    hibernate facility in some laptops. This implementation, however,
16 +    requires no special BIOS or hardware support.
17 +
18 +    The code in these files is based upon the original implementation
19 +    prepared by Gabor Kuti and additional work by Pavel Machek and a
20 +    host of others. This code has been substantially reworked by Nigel
21 +    Cunningham, again with the help and testing of many others, not the
22 +    least of whom is Michael Frank. At its heart, however, the operation is
23 +    essentially the same as Gabor's version.
24 +
25 +2.  Overview of operation.
26 +
27 +    The basic sequence of operations is as follows:
28 +
29 +       a. Quiesce all other activity.
30 +       b. Ensure enough memory and storage space are available, and attempt
31 +          to free memory/storage if necessary.
32 +       c. Allocate the required memory and storage space.
33 +       d. Write the image.
34 +       e. Power down.
35 +
36 +    There are a number of complicating factors which mean that things are
37 +    not as simple as the above would imply, however...
38 +
39 +    o The activity of each process must be stopped at a point where it is
40 +    not holding locks necessary for saving the image, and where it will not
41 +    unexpectedly restart operations due to something like a timeout, thereby
42 +    making our image inconsistent.
43 +
44 +    o It is desirable to sync outstanding I/O to disk before calculating
45 +    image statistics. This reduces corruption if one should suspend but
46 +    then not resume, and also makes later parts of the operation safer (see
47 +    below).
48 +
49 +    o We need to get as close as we can to an atomic copy of the data.
50 +    Inconsistencies in the image will result in inconsistent memory contents at
51 +    resume time, and thus in instability of the system and/or file system
52 +    corruption. This would appear to imply a maximum image size of one half of
53 +    the amount of RAM, but we have a solution... (again, below).
54 +
55 +    o In 2.6, we choose to play nicely with the other suspend-to-disk
56 +    implementations.
57 +
58 +3.  Detailed description of internals.
59 +
60 +    a. Quiescing activity.
61 +
62 +    Safely quiescing the system is achieved using three separate but related
63 +    aspects.
64 +
65 +    First, we note that the vast majority of processes don't need to run during
66 +    suspend. They can be 'frozen'. We therefore implement a refrigerator
67 +    routine, which processes enter and in which they remain until the cycle is
68 +    complete. Processes enter the refrigerator via try_to_freeze() invocations
69 +    at appropriate places.  A process cannot be frozen in any old place. It
70 +    must not be holding locks that will be needed for writing the image or
71 +    freezing other processes. For this reason, userspace processes generally
72 +    enter the refrigerator via the signal handling code, and kernel threads at
73 +    the place in their event loops where they drop locks and yield to other
74 +    processes or sleep.
75 +
76 +    The task of freezing processes is complicated by the fact that there can be
77 +    interdependencies between processes. Freezing process A before process B may
78 +    mean that process B cannot be frozen, because it stops waiting for
79 +    process A rather than in the refrigerator. This issue is seen where
80 +    userspace waits on freezeable kernel threads or fuse filesystem threads. To
81 +    address this issue, we implement the following algorithm for quiescing
82 +    activity:
83 +
84 +       - Freeze filesystems (including fuse - userspace programs starting
85 +               new requests are immediately frozen; programs already running
86 +               requests complete their work before being frozen in the next
87 +               step)
88 +       - Freeze userspace
89 +       - Thaw filesystems (this is safe now that userspace is frozen and no
90 +               fuse requests are outstanding).
91 +       - Invoke sys_sync (noop on fuse).
92 +       - Freeze filesystems
93 +       - Freeze kernel threads
94 +
95 +    If we need to free memory, we thaw kernel threads and filesystems, but not
96 +    userspace. We can then free caches without worrying about deadlocks due to
97 +    swap files being on frozen filesystems or such like.
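+
+    Expressed as a sketch in C, the sequence reads as follows. The helper
+    names here are illustrative stand-ins, not the real freezer entry
+    points:
+
+    /* Sketch of the quiescing order described above. */
+    static void quiesce_system(void)
+    {
+        freeze_filesystems();    /* new fuse requests block immediately */
+        freeze_userspace();
+        thaw_filesystems();      /* safe: userspace is now frozen */
+        sys_sync();              /* noop on fuse */
+        freeze_filesystems();
+        freeze_kernel_threads();
+    }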
98 +
99 +    b. Ensure enough memory & storage are available.
100 +
101 +    We have a number of constraints to meet in order to be able to successfully
102 +    suspend and resume.
103 +
104 +    First, the image will be written in two parts, described below. One of these
105 +    parts needs to have an atomic copy made, which of course implies a maximum
106 +    size of one half of the amount of system memory. The other part ('pageset')
107 +    is not atomically copied, and can therefore be as large or small as desired.
108 +
109 +    Second, we have constraints on the amount of storage available. In these
110 +    calculations, we may also consider any compression that will be done. The
111 +    cryptoapi module allows the user to configure an expected compression ratio.
112 +   
113 +    Third, the user can specify an arbitrary limit on the image size, in
114 +    megabytes. This limit is treated as a soft limit, so that we don't fail the
115 +    attempt to suspend if we cannot meet this constraint.
116 +
117 +    c. Allocate the required memory and storage space.
118 +
119 +    Having done the initial freeze, we determine whether the above constraints
120 +    are met, and seek to allocate the metadata for the image. If the constraints
121 +    are not met, or we fail to allocate the required space for the metadata, we
122 +    seek to free the amount of memory that we calculate is needed and try again.
123 +    We allow up to four iterations of this loop before aborting the cycle. If we
124 +    do fail, it should only be because of a bug in TuxOnIce's calculations.
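+
+    In outline, the loop might look like this sketch (the helper names are
+    invented for illustration):
+
+    /* Sketch of the prepare/free/retry loop described above. */
+    static int prepare_image_sketch(void)
+    {
+        int attempt;
+
+        for (attempt = 0; attempt < 4; attempt++) {
+            if (constraints_met() && !allocate_metadata())
+                return 0;                  /* ready to save the image */
+            free_some_memory(amount_still_needed());
+        }
+
+        return -ENOMEM;                    /* give up; abort the cycle */
+    }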
125 +    
126 +    These steps are merged together in the prepare_image function, found in
127 +    prepare_image.c. The functions are merged because of the cyclical nature
128 +    of the problem of calculating how much memory and storage is needed. Since
129 +    the data structures containing the information about the image must
130 +    themselves take memory and use storage, the amount of memory and storage
131 +    required changes as we prepare the image. Since the changes are not large,
132 +    only one or two iterations will be required to achieve a solution.
133 +
134 +    The recursive nature of the algorithm is minimised by keeping user space
135 +    frozen while preparing the image, and by the fact that our records of which
136 +    pages are to be saved and which pageset they are saved in use bitmaps (so
137 +    that changes in number or fragmentation of the pages to be saved don't
138 +    feed back via changes in the amount of memory needed for metadata). The
139 +    recursiveness is thus limited to any extra slab pages allocated to store the
140 +    extents that record storage used, and the effects of seeking to free memory.
141 +
142 +    d. Write the image.
143 +
144 +    We previously mentioned the need to create an atomic copy of the data, and
145 +    the half-of-memory limitation that is implied in this. This limitation is
146 +    circumvented by dividing the memory to be saved into two parts, called
147 +    pagesets.
148 +
149 +    Pageset2 contains the page cache - the pages on the active and inactive
150 +    lists. These pages aren't needed or modified while TuxOnIce is running, so
151 +    they can be safely written without an atomic copy. They are therefore
152 +    saved first and reloaded last. While saving these pages, TuxOnIce carefully
153 +    ensures that the work of writing the pages doesn't make the image
154 +    inconsistent.
155 +
156 +    Once pageset2 has been saved, we prepare to do the atomic copy of remaining
157 +    memory. As part of the preparation, we power down drivers, thereby providing
158 +    them with the opportunity to have their state recorded in the image. The
159 +    amount of memory allocated by drivers for this is usually negligible, but if
160 +    DRI is in use, video drivers may require significant amounts. Ideally we
161 +    would be able to query drivers while preparing the image as to the amount of
162 +    memory they will need. Unfortunately no such mechanism exists at the time of
163 +    writing. For this reason, TuxOnIce allows the user to set an
164 +    'extra_pages_allowance', which is used to seek to ensure sufficient memory
165 +    is available for drivers at this point. TuxOnIce also lets the user set this
166 +    value to 0. In this case, a test driver suspend is done while preparing the
167 +    image, and the difference (plus a margin) used instead.
168 +
169 +    Having suspended the drivers, we save the CPU context before making an
170 +    atomic copy of pageset1, resuming the drivers and saving the atomic copy.
171 +    After saving the two pagesets, we just need to save our metadata before
172 +    powering down.
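+
+    The overall ordering might be sketched as follows (illustrative helper
+    names only):
+
+    /* Sketch of the image-writing sequence described above. */
+    static void write_image_sketch(void)
+    {
+        save_pageset2();       /* page cache; needs no atomic copy */
+        suspend_drivers();     /* drivers record their state */
+        save_cpu_context();
+        copy_pageset1();       /* the atomic copy itself */
+        resume_drivers();
+        save_atomic_copy();    /* write out the copied pageset1 */
+        save_metadata();       /* header written last, then power down */
+    }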
173 +
174 +    As we mentioned earlier, the contents of pageset2 pages aren't needed once
175 +    they've been saved. We therefore use them as the destination of our atomic
176 +    copy. In the unlikely event that pageset1 is larger, extra pages are
177 +    allocated while the image is being prepared. This is normally only a real
178 +    possibility when the system has just been booted and the page cache is
179 +    small.
180 +
181 +    This is where we need to be careful about syncing, however. Pageset2 will
182 +    probably contain filesystem meta data. If this is overwritten with pageset1
183 +    and then a sync occurs, the filesystem will be corrupted - at least until
184 +    resume time and another sync of the restored data. Since there is a
185 +    possibility that the user might not resume or (may it never be!) that
186 +    suspend might oops, we do our utmost to avoid syncing filesystems after
187 +    copying pageset1.
188 +
189 +    e. Power down.
190 +
191 +    Powering down uses standard kernel routines. TuxOnIce supports powering down
192 +    using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off.
193 +    Supporting suspend to ram (S3) as a power off option might sound strange,
194 +    but it allows the user to quickly get their system up and running again if
195 +    the battery doesn't run out (we just need to re-read the overwritten pages)
196 +    and if the battery does run out (or the user removes power), they can still
197 +    resume.
198 +
199 +4.  Data Structures.
200 +
201 +    TuxOnIce uses three main structures to store its metadata and configuration
202 +    information:
203 +
204 +    a) Pageflags bitmaps.
205 +
206 +    TuxOnIce records which pages will be in pageset1, pageset2, the destination
207 +    of the atomic copy and the source of the atomically restored image using
208 +    bitmaps. These bitmaps are created from order zero allocations to maximise
209 +    reliability. The individual pages are combined together with pointers to
210 +    form per-zone bitmaps, which are in turn combined with another layer of
211 +    pointers to construct the overall bitmap.
212 +
213 +    The pageset1 bitmap is thus easily stored in the image header for use at
214 +    resume time.
215 +
216 +    As mentioned above, using bitmaps also means that the amount of memory and
217 +    storage required for recording the above information is constant. This
218 +    greatly simplifies the work of preparing the image. In earlier versions of
219 +    TuxOnIce, extents were used to record which pages would be stored. In that
220 +    case, however, eating memory could result in greater fragmentation of the
221 +    lists of pages, which in turn required more memory to store the extents and
222 +    more storage in the image header. These could in turn require further
223 +    freeing of memory, and another iteration. All of this complexity is removed
224 +    by having bitmaps.
225 +
226 +    Bitmaps also make a lot of sense because TuxOnIce only ever iterates
227 +    through the lists. There is therefore no cost to not being able to find the
228 +    nth page in order 0 time. We only need to worry about the cost of finding
229 +    the n+1th page, given the location of the nth page. Bitwise optimisations
230 +    help here.
231 +
232 +    The data structure is: unsigned long ***.
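+
+    For illustration, testing a page's bit in such a structure might look
+    like this sketch (the names are invented; the real code differs in
+    detail):
+
+    /* bitmap -> per-zone arrays -> order zero pages of bits */
+    static int page_flagged(unsigned long ***bitmap, int zone,
+                            unsigned long pfn_in_zone)
+    {
+        unsigned long bits_per_page = PAGE_SIZE << 3;
+        unsigned long *bits = bitmap[zone][pfn_in_zone / bits_per_page];
+
+        return test_bit(pfn_in_zone % bits_per_page, bits);
+    }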
233 +
234 +    b) Extents for block data.
235 +
236 +    TuxOnIce supports writing the image to multiple block devices. In the case
237 +    of swap, multiple partitions and/or files may be in use, and we happily use
238 +    them all. This is accomplished as follows:
239 +
240 +    Whatever the actual source of the allocated storage, the destination of the
241 +    image can be viewed in terms of one or more block devices, and on each
242 +    device, a list of sectors. To simplify matters, we only use contiguous,
243 +    PAGE_SIZE aligned sectors, like the swap code does.
244 +
245 +    Since sector numbers on each bdev may well not start at 0, it makes much
246 +    more sense to use extents here. Contiguous ranges of pages can thus be
247 +    represented in the extents by contiguous values.
248 +
249 +    Variations in block size are taken account of in transforming this data
250 +    into the parameters for bio submission.
251 +
252 +    We can thus implement a layer of abstraction wherein the core of TuxOnIce
253 +    doesn't have to worry about which device we're currently writing to or
254 +    where in the device we are. It simply requests that the next page in the
255 +    pageset or header be written, leaving the details to this lower layer.
256 +    The lower layer remembers where in the sequence of devices and blocks each
257 +    pageset starts. The header always starts at the beginning of the allocated
258 +    storage.
259 +
260 +    So extents are:
261 +
262 +    struct extent {
263 +      unsigned long minimum, maximum;
264 +      struct extent *next;
265 +    }
266 +
267 +    These are combined into chains of extents for a device:
268 +
269 +    struct extent_chain {
270 +      int size; /* size of the chain, i.e. sum of (max - min + 1) */
271 +      int allocs, frees;
272 +      char *name;
273 +      struct extent *first, *last_touched;
274 +    };
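+
+    As a sketch, adding a value to a chain might look like this
+    (error handling simplified; it assumes values are appended in
+    increasing order):
+
+    static int extent_chain_add(struct extent_chain *chain,
+                                unsigned long value)
+    {
+        struct extent *new;
+
+        chain->size++;
+
+        /* Contiguous with the last extent touched? Just extend it. */
+        if (chain->last_touched &&
+            chain->last_touched->maximum == value - 1) {
+            chain->last_touched->maximum = value;
+            return 0;
+        }
+
+        new = kmalloc(sizeof(struct extent), GFP_KERNEL);
+        if (!new)
+            return -ENOMEM;
+
+        new->minimum = new->maximum = value;
+        new->next = NULL;
+
+        if (chain->last_touched)
+            chain->last_touched->next = new;
+        else
+            chain->first = new;
+        chain->last_touched = new;
+        chain->allocs++;
+        return 0;
+    }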
275 +
276 +    For each bdev, we need to store a little more info:
277 +
278 +    struct suspend_bdev_info {
279 +       struct block_device *bdev;
280 +       dev_t dev_t;
281 +       int bmap_shift;
282 +       int blocks_per_page;
283 +    };
284 +
285 +    The dev_t is used to identify the device in the stored image. As a result,
286 +    we expect devices at resume time to have the same major and minor numbers
287 +    as they had while suspending.  This is primarily a concern where the user
288 +    utilises LVM for storage, as they will need to dmsetup their partitions in
289 +    such a way as to maintain this consistency at resume time.
290 +
291 +    bmap_shift and blocks_per_page record the effects of variations in
292 +    blocks-per-page settings for the filesystem and underlying bdev. For most
293 +    filesystems, these are the same, but for xfs, they can have independent
294 +    values.
295 +
296 +    Combining these two structures together, we have everything we need to
297 +    record what devices and what blocks on each device are being used to
298 +    store the image, and to submit I/O using submit_bio.
299 +
300 +    The last elements in the picture are a means of recording how the storage
301 +    is being used.
302 +
303 +    We do this first and foremost by implementing a layer of abstraction on
304 +    top of the devices and extent chains which allows us to view however many
305 +    devices there might be as one long storage tape, with a single 'head' that
306 +    tracks a 'current position' on the tape:
307 +
308 +    struct extent_iterate_state {
309 +      struct extent_chain *chains;
310 +      int num_chains;
311 +      int current_chain;
312 +      struct extent *current_extent;
313 +      unsigned long current_offset;
314 +    };
315 +
316 +    That is, *chains points to an array of size num_chains of extent chains.
317 +    For the filewriter, this is always a single chain. For the swapwriter, the
318 +    array is of size MAX_SWAPFILES.
319 +
320 +    current_chain, current_extent and current_offset thus point to the current
321 +    index in the chains array (and into a matching array of struct
322 +    suspend_bdev_info), the current extent in that chain (to optimise access),
323 +    and the current offset within that extent.
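+
+    A sketch of moving the head forward one page shows how this state is
+    advanced (illustrative only):
+
+    /* Advance the tape head by one page. Returns 0, or -ENOSPC when the
+       allocated storage is exhausted. */
+    static int tape_advance(struct extent_iterate_state *state)
+    {
+        if (state->current_offset < state->current_extent->maximum) {
+            state->current_offset++;              /* same extent */
+            return 0;
+        }
+
+        if (state->current_extent->next) {        /* next extent */
+            state->current_extent = state->current_extent->next;
+            state->current_offset = state->current_extent->minimum;
+            return 0;
+        }
+
+        while (++state->current_chain < state->num_chains) {
+            struct extent *first =
+                state->chains[state->current_chain].first;
+
+            if (first) {                          /* next device */
+                state->current_extent = first;
+                state->current_offset = first->minimum;
+                return 0;
+            }
+        }
+
+        return -ENOSPC;
+    }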
324 +
325 +    The image is divided into three parts:
326 +    - The header
327 +    - Pageset 1
328 +    - Pageset 2
329 +
330 +    The header always starts at the first device and first block. We know its
331 +    size before we begin to save the image because we carefully account for
332 +    everything that will be stored in it.
333 +
334 +    The second pageset (LRU) is stored first. It begins on the next page after
335 +    the end of the header.
336 +
337 +    The first pageset is stored second. Its start location is only known once
338 +    pageset2 has been saved, since pageset2 may be compressed as it is written.
339 +    This location is thus recorded at the end of saving pageset2. It is also
340 +    page aligned.
341 +
342 +    Since this information is needed at resume time, and the location of extents
343 +    in memory will differ at resume time, this needs to be stored in a portable
344 +    way:
345 +
346 +    struct extent_iterate_saved_state {
347 +        int chain_num;
348 +        int extent_num;
349 +        unsigned long offset;
350 +    };
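+
+    Converting from the live state to this form is then a matter of
+    counting extents rather than storing pointers, as in this sketch
+    (names illustrative):
+
+    static void save_position(struct extent_iterate_state *state,
+                              struct extent_iterate_saved_state *saved)
+    {
+        struct extent *this;
+
+        saved->chain_num = state->current_chain;
+        saved->offset = state->current_offset;
+        saved->extent_num = 0;
+
+        for (this = state->chains[state->current_chain].first;
+             this && this != state->current_extent; this = this->next)
+            saved->extent_num++;
+    }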
351 +
352 +    We can thus implement a layer of abstraction wherein the core of TuxOnIce
353 +    doesn't have to worry about which device we're currently writing to or
354 +    where in the device we are. It simply requests that the next page in the
355 +    pageset or header be written, leaving the details to this layer, and
356 +    invokes the routines to remember and restore the position, without having
357 +    to worry about the details of how the data is arranged on disk or such like.
358 +
359 +    c) Modules
360 +
361 +    One aim in designing TuxOnIce was to make it flexible. We wanted to allow
362 +    for the implementation of different methods of transforming a page to be
363 +    written to disk and different methods of getting the pages stored.
364 +
365 +    In early versions (the betas and perhaps Suspend1), compression support was
366 +    inlined in the image writing code, and the data structures and code for
367 +    managing swap were intertwined with the rest of the code. A number of people
368 +    had expressed interest in implementing image encryption, and alternative
369 +    methods of storing the image.
370 +
371 +    In order to achieve this, TuxOnIce was given a modular design.
372 +
373 +    A module is a single file which encapsulates the functionality needed
374 +    to transform a pageset of data (encryption or compression, for example),
375 +    or to write the pageset to a device. The former type of module is called
376 +    a 'page-transformer', the latter a 'writer'.
377 +
378 +    Modules are linked together in pipeline fashion. There may be zero or more
379 +    page transformers in a pipeline, and there is always exactly one writer.
380 +    The pipeline follows this pattern:
381 +
382 +               ---------------------------------
383 +               |          TuxOnIce Core        |
384 +               ---------------------------------
385 +                               |
386 +                               |
387 +               ---------------------------------
388 +               |       Page transformer 1      |
389 +               ---------------------------------
390 +                               |
391 +                               |
392 +               ---------------------------------
393 +               |       Page transformer 2      |
394 +               ---------------------------------
395 +                               |
396 +                               |
397 +               ---------------------------------
398 +               |            Writer             |
399 +               ---------------------------------
400 +
401 +    During the writing of an image, the core code feeds pages one at a time
402 +    to the first module. This module performs whatever transformations it
403 +    implements on the incoming data, completely consuming the incoming data and
404 +    feeding output in a similar manner to the next module. A module may buffer
405 +    its output.
406 +
407 +    During reading, the pipeline works in the reverse direction. The core code
408 +    calls the first module with the address of a buffer which should be filled.
409 +    (Note that the buffer size is always PAGE_SIZE at this time). This module
410 +    will in turn request data from the next module and so on down until the
411 +    writer is made to read from the stored image.
412 +
413 +    Part of the definition of the structure of a module thus looks like this:
414 +
415 +        int (*rw_init) (int rw, int stream_number);
416 +        int (*rw_cleanup) (int rw);
417 +        int (*write_chunk) (struct page *buffer_page);
418 +        int (*read_chunk) (struct page *buffer_page, int sync);
419 +
420 +    It should be noted that the _cleanup routine may be called before the
421 +    full stream of data has been read or written. While writing the image,
422 +    the user may (depending upon settings) choose to abort suspending, and
423 +    if we are in the midst of writing the last portion of the image, a portion
424 +    of the second pageset may be reread. This may also happen if an error
425 +    occurs and we seek to abort the process of writing the image.
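+
+    A minimal page transformer that applies no transformation might look
+    like this sketch (struct toi_module and next_module are illustrative
+    stand-ins for the real module plumbing):
+
+    static struct toi_module *next_module; /* next stage in the pipeline */
+
+    static int noop_write_chunk(struct page *buffer_page)
+    {
+        /* A real transformer would compress or encrypt the page here
+           before passing the result on. */
+        return next_module->write_chunk(buffer_page);
+    }
+
+    static int noop_read_chunk(struct page *buffer_page, int sync)
+    {
+        /* On the read path, ask the next module to fill the buffer,
+           then undo the transformation (nothing to undo here). */
+        return next_module->read_chunk(buffer_page, sync);
+    }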
426 +
427 +    The modular design is also useful in a number of other ways. It provides
428 +    a means whereby we can add support for:
429 +
430 +    - providing overall initialisation and cleanup routines;
431 +    - serialising configuration information in the image header;
432 +    - providing debugging information to the user;
433 +    - determining memory and image storage requirements;
434 +    - dis/enabling components at run-time;
435 +    - configuring the module (see below);
436 +
437 +    ...and routines for writers specific to their work:
438 +    - Parsing a resume= location;
439 +    - Determining whether an image exists;
440 +    - Marking a resume as having been attempted;
441 +    - Invalidating an image;
442 +
443 +    Since some parts of the core - the user interface and storage manager
444 +    support - have use for some of these functions, they are registered as
445 +    'miscellaneous' modules as well.
446 +
447 +    d) Sysfs data structures.
448 +
449 +    This brings us naturally to support for configuring TuxOnIce. We desired to
450 +    provide a way to make TuxOnIce as flexible and configurable as possible.
451 +    The user shouldn't have to reboot just because they now want to suspend to
452 +    a file instead of a partition, for example.
453 +
454 +    To accomplish this, TuxOnIce implements a very generic means whereby the
455 +    core and modules can register new sysfs entries. All TuxOnIce entries use
456 +    a single _store and _show routine, both of which are found in sysfs.c in
457 +    the kernel/power directory. These routines handle the most common operations
458 +    - getting and setting the values of bits, integers, longs, unsigned longs
459 +    and strings in one place, and allow overrides for customised get and set
460 +    options as well as side-effect routines for all reads and writes.
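+
+    In outline, the shared _show routine might dispatch on the type of the
+    entry like this (a sketch only; the structure and field names are
+    illustrative, not the real ones):
+
+    static ssize_t toi_attr_show(struct toi_sysfs_data *sd, char *page)
+    {
+        switch (sd->type) {
+        case TOI_SYSFS_INT:
+            return sprintf(page, "%d\n", *sd->int_variable);
+        case TOI_SYSFS_UL:
+            return sprintf(page, "%lu\n", *sd->ul_variable);
+        case TOI_SYSFS_STRING:
+            return sprintf(page, "%s\n", sd->string_variable);
+        case TOI_SYSFS_BIT:
+            return sprintf(page, "%d\n",
+                test_bit(sd->bit, sd->bit_vector) ? 1 : 0);
+        }
+        return 0;
+    }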
461 +
462 +    When combined with some simple macros, a new sysfs entry can then be defined
463 +    in just a couple of lines:
464 +
465 +    { TOI_ATTR("progress_granularity", SYSFS_RW),
466 +      SYSFS_INT(&progress_granularity, 1, 2048)
467 +    },
468 +
469 +    This defines a sysfs entry named "progress_granularity" which is rw and
470 +    allows the user to access an integer stored at &progress_granularity, giving
471 +    it a value between 1 and 2048 inclusive.
472 +
473 +    Sysfs entries are registered under /sys/power/tuxonice, and entries for
474 +    modules are located in a subdirectory named after the module.
475 +
476 diff --git a/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt
477 new file mode 100644
478 index 0000000..c6d0778
479 --- /dev/null
480 +++ b/Documentation/power/tuxonice.txt
481 @@ -0,0 +1,758 @@
482 +       --- TuxOnIce, version 3.0 ---
483 +
484 +1.  What is it?
485 +2.  Why would you want it?
486 +3.  What do you need to use it?
487 +4.  Why not just use the version already in the kernel?
488 +5.  How do you use it?
489 +6.  What do all those entries in /sys/power/tuxonice do?
490 +7.  How do you get support?
491 +8.  I think I've found a bug. What should I do?
492 +9.  When will XXX be supported?
493 +10. How does it work?
494 +11. Who wrote TuxOnIce?
495 +
496 +1. What is it?
497 +
498 +   Imagine you're sitting at your computer, working away. For some reason, you
499 +   need to turn off your computer for a while - perhaps it's time to go home
500 +   for the day. When you come back to your computer next, you're going to want
501 +   to carry on where you left off. Now imagine that you could push a button and
502 +   have your computer store the contents of its memory to disk and power down.
503 +   Then, when you next start up your computer, it loads that image back into
504 +   memory and you can carry on from where you were, just as if you'd never
505 +   turned the computer off. You have far less time to start up, no reopening of
506 +   applications or finding what directory you put that file in yesterday.
507 +   That's what TuxOnIce does.
508 +
509 +   TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who,
510 +   with some help from Pavel Machek, got an early version going in 1999. The
511 +   project was then taken over by Florent Chabaud while still in alpha version
512 +   numbers. Nigel Cunningham came on the scene when Florent was unable to
513 +   continue, moving the project into betas, then 1.0, 2.0 and so on up to
514 +   the present series. During the 2.0 series, the name was contracted to
515 +   Suspend2 and the website suspend2.net created. Beginning around July 2007,
516 +   a transition to calling the software TuxOnIce was made, to seek to help
517 +   make it clear that TuxOnIce is more concerned with hibernation than suspend
518 +   to ram.
519 +
520 +   Pavel Machek's swsusp code, which was merged around 2.5.17, retains the
521 +   original name, and was essentially a fork of the beta code until Rafael
522 +   Wysocki came on the scene in 2005 and began to improve it further.
523 +
524 +2. Why would you want it?
525 +
526 +   Why wouldn't you want it?
527 +   
528 +   Being able to save the state of your system and quickly restore it improves
529 +   your productivity - you get a useful system in far less time than through
530 +   the normal boot process.
531 +   
532 +3. What do you need to use it?
533 +
534 +   a. Kernel Support.
535 +
536 +   i) The TuxOnIce patch.
537 +   
538 +   TuxOnIce is part of the Linux Kernel. This version is not part of Linus's
539 +   2.6 tree at the moment, so you will need to download the kernel source and
540 +   apply the latest patch. Having done that, enable the appropriate options in
541 +   make [menu|x]config (under Power Management Options - look for "Enhanced
542 +   Hibernation"), compile and install your kernel. TuxOnIce works with SMP,
543 +   Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64.
544 +
545 +   TuxOnIce patches are available from http://tuxonice.net.
546 +
547 +   ii) Compression support.
548 +
549 +   Compression support is implemented via the cryptoapi. You will therefore want
550 +   to select any Cryptoapi transforms that you want to use on your image from
551 +   the Cryptoapi menu while configuring your kernel. Part of the TuxOnIce patch
552 +   adds a new cryptoapi compression algorithm called LZF. We recommend this
553 +   compression method - it is very fast and still achieves good compression.
554 +
555 +   You can also tell TuxOnIce to write its image to an encrypted and/or
556 +   compressed filesystem/swap partition. In that case, you don't need to do
557 +   anything special for TuxOnIce when it comes to kernel configuration.
558 +
559 +   iii) Configuring other options.
560 +
561 +   While you're configuring your kernel, try to configure as much as possible
562 +   to build as modules. We recommend this because there are a number of drivers
563 +   that are still in the process of implementing proper power management
564 +   support. In those cases, the best way to work around their current lack is
565 +   to build them as modules and remove the modules while hibernating. You might
566 +   also bug the driver authors to get their support up to speed, or even help!
567 +
568 +   b. Storage.
569 +
570 +   i) Swap.
571 +
572 +   TuxOnIce can store the hibernation image in your swap partition, a swap file or
573 +   a combination thereof. Whichever combination you choose, you will probably
574 +   want to create enough swap space to store the largest image you could have,
575 +   plus the space you'd normally use for swap. A good rule of thumb would be
576 +   to calculate the amount of swap you'd want without using TuxOnIce, and then
577 +   add the amount of memory you have. This swapspace can be arranged in any way
578 +   you'd like. It can be in one partition or file, or spread over a number. The
579 +   only requirement is that they be active when you start a hibernation cycle.
580 +   
581 +   There is one exception to this requirement. TuxOnIce has the ability to turn
582 +   on one swap file or partition at the start of hibernating and turn it back off
583 +   at the end. If you want to ensure you have enough memory to store an image
584 +   when your memory is fully used, you might want to make one swap partition or
585 +   file for 'normal' use, and another for TuxOnIce to activate & deactivate
586 +   automatically. (Further details below).
587 +
588 +   ii) Normal files.
589 +
590 +   TuxOnIce includes a 'file allocator'. The file allocator can store your
591 +   image in a simple file. Since Linux has the concept of everything being a
592 +   file, this is more powerful than it initially sounds. If, for example, you
593 +   were to set up a network block device file, you could hibernate to a network
594 +   server. This has been tested and works to a point, but nbd itself isn't
595 +   stateless enough for our purposes.
596 +
597 +   Take extra care when setting up the file allocator. If you just type
598 +   commands without thinking and then try to hibernate, you could cause
599 +   irreversible corruption on your filesystems! Make sure you have backups.
600 +
601 +   Most people will only want to hibernate to a local file. To achieve that, do
602 +   something along the lines of:
603 +
604 +   echo "TuxOnIce" > /hibernation-file
605 +   dd if=/dev/zero bs=1M count=512 >> /hibernation-file
606 +
607 +   This will create a 512MB file called /hibernation-file. To get TuxOnIce to use
608 +   it:
609 +
610 +   echo /hibernation-file > /sys/power/tuxonice/file/target
611 +
612 +   Then
613 +
614 +   cat /sys/power/tuxonice/resume
615 +
616 +   Put the results of this into your bootloader's configuration (see also step
617 +   C, below):
618 +
619 +   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
620 +   # cat /sys/power/tuxonice/resume
621 +   file:/dev/hda2:0x1e001
622 +   
623 +   In this example, we would edit the append= line of our lilo.conf|menu.lst
624 +   so that it included:
625 +
626 +   resume=file:/dev/hda2:0x1e001
627 +   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
628 +
629 +   For those who are thinking 'Could I make the file sparse?', the answer is
630 +   'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in
631 +   a sparse file while hibernating. In the longer term (post merge!), I'd like
632 +   to change things so that the file could be dynamically resized and have
633 +   holes filled as needed. Right now, however, that's not possible and not a
634 +   priority.
635 +
636 +   c. Bootloader configuration.
637 +   
638 +   Using TuxOnIce also requires that you add an extra parameter to 
639 +   your lilo.conf or equivalent. Here's an example for a swap partition:
640 +
641 +   append="resume=swap:/dev/hda1"
642 +
643 +   This would tell TuxOnIce that /dev/hda1 is a swap partition you 
644 +   have. TuxOnIce will use the swap signature of this partition as a
645 +   pointer to your data when you hibernate. This means that (in this example)
646 +   /dev/hda1 doesn't need to be _the_ swap partition where all of your data
647 +   is actually stored. It just needs to be a swap partition that has a
648 +   valid signature.
649 +
650 +   You don't need to have a swap partition for this purpose. TuxOnIce
651 +   can also use a swap file, but usage is a little more complex. Having made
652 +   your swap file, turn it on and do 
653 +
654 +   cat /sys/power/tuxonice/swap/headerlocations
655 +
656 +   (this assumes you've already compiled your kernel with TuxOnIce
657 +   support and booted it). The results of the cat command will tell you
658 +   what you need to put in lilo.conf:
659 +
660 +   For swap partitions like /dev/hda1, simply use resume=/dev/hda1.
661 +   For the swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d.
662 +
663 +   If the swapfile changes for any reason (it is moved to a different
664 +   location, it is deleted and recreated, or the filesystem is
665 +   defragmented) then you will have to check
666 +   /sys/power/tuxonice/swap/headerlocations for a new resume_block value.
667 +
668 +   Once you've compiled and installed the kernel and adjusted your bootloader
669 +   configuration, you should only need to reboot for the most basic part
670 +   of TuxOnIce to be ready.
671 +
672 +   If you only compile in the swap allocator, or only compile in the file
673 +   allocator, you don't need to add the "swap:" part of the resume=
674 +   parameters above. resume=/dev/hda2:0x242d will work just as well. If you
675 +   have compiled both and your storage is on swap, you can also use this
676 +   format (the swap allocator is the default allocator).
677 +
678 +   When compiling your kernel, one of the options in the 'Power Management
679 +   Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is
680 +   called 'Default resume partition'. This can be used to set a default value
681 +   for the resume= parameter.
682 +
683 +   d. The hibernate script.
684 +
685 +   Since the driver model in 2.6 kernels is still being developed, you may need
686 +   to do more than just configure TuxOnIce. Users of TuxOnIce usually start the
687 +   process via a script which prepares for the hibernation cycle, tells the
688 +   kernel to do its stuff and then restores things afterwards. This script might
689 +   involve:
690 +
691 +   - Switching to a text console and back if X doesn't like the video card
692 +     status on resume.
693 +   - Un/reloading drivers that don't play well with hibernation.
694 +  
695 +   Note that you might not be able to unload some drivers if there are 
696 +   processes using them. You might have to kill off processes that hold
697 +   devices open. Hint: if your X server accesses a USB mouse, doing a
698 +   'chvt' to a text console releases the device and you can unload the
699 +   module.
700 +
701 +   Check out the latest script (available on tuxonice.net).
702 +
703 +   e. The userspace user interface.
704 +
705 +   TuxOnIce has very limited support for displaying status if you only apply
706 +   the kernel patch - it can printk messages, but that is all. In addition,
707 +   some of the functions mentioned in this document (such as cancelling a cycle
708 +   or performing interactive debugging) are unavailable. To utilise these
709 +   functions, or simply get a nice display, you need the 'userui' component.
710 +   Userui comes in three flavours, usplash, fbsplash and text. Text should
711 +   work on any console. Usplash and fbsplash require the appropriate
712 +   (distro specific?) support.
713 +
714 +   To utilise a userui, TuxOnIce just needs to be told where to find the
715 +   userspace binary:
716 +
717 +   echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program
718 +
719 +   The hibernate script can do this for you, and a default value for this
720 +   setting can be configured when compiling the kernel. This path is also
721 +   stored in the image header, so if you have an initrd or initramfs, you can
722 +   use the userui during the first part of resuming (prior to the atomic
723 +   restore) by putting the binary in the same path in your initrd/ramfs.
724 +   Alternatively, you can put it in a different location and do an echo
725 +   similar to the above prior to the echo > do_resume. The value saved in the
726 +   image header will then be ignored.
727 +
728 +4. Why not just use the version already in the kernel?
729 +
730 +   The version in the vanilla kernel has a number of drawbacks. The most
731 +   serious of these are:
732 +       - it has a maximum image size of 1/2 total memory.
733 +       - it doesn't allocate storage until after it has snapshotted memory.
734 +         This means that you can't be sure hibernating will work until you
735 +         see it start to write the image.
736 +       - it performs all of its I/O synchronously.
737 +       - it does not allow you to press escape to cancel a cycle.
738 +       - it does not allow you to automatically swapon a file when
739 +         starting a cycle.
740 +       - it does not allow you to use multiple swap partitions.
741 +       - it does not allow you to use swapfiles.
742 +       - it does not allow you to use ordinary files.
743 +       - it just invalidates an image and continues to boot if you
744 +         accidentally boot the wrong kernel after hibernating.
745 +       - it doesn't support any sort of nice display while hibernating
746 +       - it is moving toward requiring that you have an initrd/initramfs
747 +         to ever have a hope of resuming (uswsusp). While uswsusp will
748 +         address some of the concerns above, it won't address all of them,
749 +         and will be more complicated to get set up.
750 +
751 +5. How do you use it?
752 +
753 +   A hibernation cycle can be started directly by doing:
754 +
755 +       echo > /sys/power/tuxonice/do_hibernate
756 +
757 +   In practice, though, you'll probably want to use the hibernate script
758 +   to unload modules, configure the kernel the way you like it and so on.
759 +   In that case, you'd do (as root):
760 +
761 +       hibernate
762 +
763 +   See the hibernate script's man page for more details on the options it
764 +   takes.
765 +
766 +   If you're using the text or splash user interface modules, one feature of
767 +   TuxOnIce that you might find useful is that you can press Escape at any time
768 +   during hibernating, and the process will be aborted.
769 +
770 +   Due to the way hibernation works, this means you'll have your system back and
771 +   perfectly usable almost instantly. The only exception is when it's at the
772 +   very end of writing the image. Then it will need to reload a small (usually
773 +   4-50MB, depending upon the image characteristics) portion first.
774 +
775 +   Likewise, when resuming, you can press escape and resuming will be aborted.
776 +   The computer will then power down again, according to the settings in
777 +   force at that time for the powerdown method or rebooting.
778 +
779 +   You can change the settings for powering down while the image is being
780 +   written by pressing 'R' (to toggle rebooting) and 'O' (to toggle between
781 +   suspending to ram and powering down completely).
782 +   
783 +   If you run into problems with resuming, adding the "noresume" option to
784 +   the kernel command line will let you skip the resume step and recover your
785 +   system. This option shouldn't normally be needed, because TuxOnIce modifies
786 +   the image header prior to the atomic restore, and will thus prompt you
787 +   if it detects that you've tried to resume an image before (this flag is
788 +   removed if you press Escape to cancel a resume, so you won't be prompted
789 +   then).
790 +
791 +   Recent kernels (2.6.24 onwards) add support for resuming from a different
792 +   kernel to the one that was hibernated (thanks to Rafael for his work on
793 +   this - I've just embraced and enhanced the support for TuxOnIce). This
794 +   should further reduce the need for you to use the noresume option.
795 +
796 +6. What do all those entries in /sys/power/tuxonice do?
797 +
798 +   /sys/power/tuxonice is the directory which contains files you can use to
799 +   tune and configure TuxOnIce to your liking. The exact contents of
800 +   the directory will depend upon the version of TuxOnIce you're
801 +   running and the options you selected at compile time. In the following
802 +   descriptions, names in brackets refer to compile time options.
803 +   (Note that they're all dependent upon you having selected CONFIG_TUXONICE
804 +   in the first place!).
805 +
806 +   Since the values of these settings can open potential security risks, the
807 +   writeable ones are accessible only to the root user. You may want to
808 +   configure sudo to allow you to invoke your hibernate script as an ordinary
809 +   user.
810 +
811 +   - checksum/enabled
812 +
813 +   Use cryptoapi hashing routines to verify that Pageset2 pages don't change
814 +   while we're saving the first part of the image, and to get any pages that
815 +   do change resaved in the atomic copy. This should normally not be needed,
816 +   but if you're seeing issues, please enable this. If your issues stop you
817 +   being able to resume, enable this option, hibernate and cancel the cycle
818 +   after the atomic copy is done. If the debugging info shows a non-zero
819 +   number of pages resaved, please report this to Nigel.
820 +
821 +   - compression/algorithm
822 +
823 +   Set the cryptoapi algorithm used for compressing the image.
824 +
825 +   - compression/expected_compression
826 +
827 +   This value allows you to set an expected compression ratio, which TuxOnIce
828 +   will use in calculating whether it meets constraints on the image size. If
829 +   this expected compression ratio is not attained, the hibernation cycle will
830 +   abort, so it is wise to allow some spare. You can see what compression
831 +   ratio is achieved in the logs after hibernating.
832 +
833 +   - debug_info:
834 +  
835 +   This file returns information about your configuration that may be helpful
836 +   in diagnosing problems with hibernating.
837 +
838 +   - do_hibernate:
839 +
840 +   When anything is written to this file, the kernel side of TuxOnIce will
841 +   begin to attempt to write an image to disk and power down. You'll normally
842 +   want to run the hibernate script instead, to get modules unloaded first.
843 +
844 +   - do_resume:
845 +
846 +   When anything is written to this file TuxOnIce will attempt to read and
847 +   restore an image. If there is no image, it will return almost immediately.
848 +   If an image exists, the echo > will never return. Instead, the original
849 +   kernel context will be restored and the original echo > do_hibernate will
850 +   return.
851 +
852 +   - */enabled
853 +
854 +   These options can be used to temporarily disable various parts of TuxOnIce.
855 +
856 +   - extra_pages_allowance
857 +
858 +   When TuxOnIce does its atomic copy, it calls the driver model suspend
859 +   and resume methods. If you have DRI enabled with a driver such as fglrx,
860 +   this can result in the driver allocating a substantial amount of memory
861 +   for storing its state. Extra_pages_allowance tells TuxOnIce how much
862 +   extra memory it should ensure is available for those allocations. If
863 +   your attempts at hibernating end with a message in dmesg indicating that
864 +   insufficient extra pages were allowed, you need to increase this value.
865 +
866 +   - file/target:
867 +
868 +   Read this value to get the current setting. Write to it to point TuxOnIce
869 +   at a new storage location for the file allocator. See section 3.b.ii above
870 +   for details of how to set up the file allocator.
871 +
872 +   - freezer_test
873 +
874 +   This entry can be used to get TuxOnIce to just test the freezer and prepare
875 +   an image without actually doing a hibernation cycle. It is useful for
876 +   diagnosing freezing and image preparation issues.
877 +
878 +   - image_exists:
879 +
880 +   Can be used in a script to determine whether a valid image exists at the
881 +   location currently pointed to by resume=. Returns up to three lines.
882 +   The first is whether an image exists (-1 for unsure, otherwise 0 or 1).
883 +   If an image exists, additional lines will return the machine and version.
884 +   Echoing anything to this entry removes any current image.
885 +
886 +   - image_size_limit:
887 +
888 +   The maximum size of hibernation image written to disk, measured in megabytes
889 +   (1024*1024).
890 +
891 +   - interface_version:
892 +
893 +   The value returned by this file can be used by scripts and configuration
894 +   tools to determine what entries should be looked for. The value is
895 +   incremented whenever an entry in /sys/power/tuxonice is obsoleted or 
896 +   added.
897 +
898 +   - last_result:
899 +
900 +   The result of the last hibernation cycle, as defined in
901 +   include/linux/suspend-debug.h with the values SUSPEND_ABORTED to
902 +   SUSPEND_KEPT_IMAGE. This is a bitmask.
903 +
904 +   - log_everything (CONFIG_PM_DEBUG):
905 +
906 +   Setting this option results in all messages printed being logged. Normally,
907 +   only a subset are logged, so as to not slow the process and not clutter the
908 +   logs. Useful for debugging. It can be toggled during a cycle by pressing
909 +   'L'.
910 +
911 +   - pause_between_steps (CONFIG_PM_DEBUG):
912 +
913 +   This option is used during debugging, to make TuxOnIce pause between
914 +   each step of the process. It is ignored when the nice display is on.
915 +
916 +   - powerdown_method:
917 +
918 +   Used to select a method by which TuxOnIce should powerdown after writing the
919 +   image. Currently:
920 +
921 +   0: Don't use ACPI to power off.
922 +   3: Attempt to enter Suspend-to-ram.
923 +   4: Attempt to enter ACPI S4 mode.
924 +   5: Attempt to power down via ACPI S5 mode.
925 +
926 +   Note that these options are highly dependent upon your hardware & software:
927 +
928 +   3: When successful, your machine suspends to ram instead of powering off.
929 +      The advantage of using this mode is that it doesn't matter whether your
930 +      battery has enough charge to make it through to your next resume. If it
931 +      lasts, you will simply resume from suspend to ram (and the image on disk
932 +      will be discarded). If the battery runs out, you will resume from disk
933 +      instead. The disadvantage is that it takes longer than a normal
934 +      suspend-to-ram to enter the state, since the suspend-to-disk image needs
935 +      to be written first.
936 +   4/5: When successful, your machine will be off and consume (almost) no power.
937 +      But it might still react to some external events like opening the lid or
938 +      traffic on a network or usb device. For the BIOS, resume is then the same
939 +      as a warm boot, similar to a situation where you used the command `reboot'
940 +      to reboot your machine. If your machine has problems on warm boot or if
941 +      you want to protect your machine with the BIOS password, this is probably
942 +      not the right choice. Mode 4 may be necessary on some machines where ACPI
943 +      wake up methods need to be run to properly reinitialise hardware after a
944 +      hibernation cycle.
945 +   0: Switch the machine completely off. The only possible wakeup is the power
946 +      button. For the BIOS, resume is then the same as a cold boot, in
947 +      particular you would have to provide your BIOS boot password if your
948 +      machine uses that feature for booting.
949 +
950 +   - progressbar_granularity_limit:
951 +
952 +   This option can be used to limit the granularity of the progress bar
953 +   displayed with a bootsplash screen. The value is the maximum number of
954 +   steps. That is, 10 will make the progress bar jump in 10% increments.
955 +
956 +   - reboot:
957 +
958 +   This option causes TuxOnIce to reboot rather than powering down
959 +   at the end of saving an image. It can be toggled during a cycle by pressing
960 +   'R'.
961 +
962 +   - resume_commandline:
963 +
964 +   This entry can be read after resuming to see the commandline that was used
965 +   when resuming began. You might use this to set up two bootloader entries
965 +   that are the same apart from the fact that one includes an extra append=
967 +   argument "at_work=1". You could then grep resume_commandline in your
968 +   post-resume scripts and configure networking (for example) differently
969 +   depending upon whether you're at home or work. resume_commandline can be
970 +   set to arbitrary text if you wish to remove sensitive contents.
971 +
972 +   - swap/swapfilename:
973 +
974 +   This entry is used to specify the swapfile or partition that
975 +   TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
976 +   I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
977 +   for my hibernation image, I would
978 +  
979 +   echo /dev/hda2 > /sys/power/tuxonice/swap/swapfile
980 +
981 +   /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
982 +   swapon and swapoff occur while other processes are frozen (including kswapd)
983 +   so this swap file will not be used up when attempting to free memory. The
984 +   partition/file is also given the highest priority, so other swapfiles/partitions
985 +   will only be used to save the image when this one is filled.
986 +
987 +   The value of this file is used by headerlocations along with any currently
988 +   activated swapfiles/partitions.
989 +
990 +   - swap/headerlocations:
991 +
992 +   This option tells you the resume= options to use for swap devices you
993 +   currently have activated. It is particularly useful when you only want to
994 +   use a swap file to store your image. See above for further details.
995 +
996 +   - toggle_process_nofreeze
997 +
998 +   This entry can be used to toggle the NOFREEZE flag on a process, to allow it
999 +   to run during hibernating. It should be used with extreme caution. There are
1000 +   strict limitations on what a process running during hibernation can do. This
1001 +   is really only intended for use by TuxOnIce's helpers (userui in particular).
1002 +
1003 +   - userui_program
1004 +
1005 +   This entry is used to tell TuxOnIce what userspace program to use for
1006 +   providing a user interface while hibernating. The program uses a netlink
1007 +   socket to pass messages back and forth to the kernel, providing all of the
1008 +   functions formerly implemented in the kernel user interface components.
1009 +
1010 +   - user_interface/debug_sections (CONFIG_PM_DEBUG):
1011 +
1012 +   This value, together with the console log level, controls what debugging
1013 +   information is displayed. The console log level determines the level of
1014 +   detail, and this value determines what detail is displayed. This value is
1015 +   a bit vector, and the meaning of the bits can be found in the kernel tree
1016 +   in include/linux/tuxonice.h. It can be overridden using the kernel's
1017 +   command line option suspend_dbg.
1018 +
1019 +   - user_interface/default_console_level (CONFIG_PM_DEBUG):
1020 +
1021 +   This determines the value of the console log level at the start of a
1022 +   hibernation cycle. If debugging is compiled in, the console log level can be
1023 +   changed during a cycle by pressing the digit keys. Meanings are:
1024 +
1025 +   0: Nice display.
1026 +   1: Nice display plus numerical progress.
1027 +   2: Errors only.
1028 +   3: Low level debugging info.
1029 +   4: Medium level debugging info.
1030 +   5: High level debugging info.
1031 +   6: Verbose debugging info.
1032 +
1033 +   - user_interface/enable_escape:
1034 +
1035 +   Setting this to "1" will enable you to abort a hibernation cycle or a
1036 +   resume by pressing escape; "0" (default) disables this feature. Note that
1037 +   enabling this option means that you cannot initiate a hibernation cycle
1038 +   and then walk away from your computer, expecting it to be secure. With the
1039 +   feature disabled, you can validly have this expectation once TuxOnIce
1040 +   begins to write the image to disk. (Prior to this point, it is possible
1041 +   that TuxOnIce might abort because of a failure to freeze all processes or
1042 +   because constraints on its ability to save the image are not met.)
1044 +
1045 +   - version:
1046 +  
1047 +   The version of TuxOnIce you have compiled into the currently running kernel.
1048 +
1049 +7. How do you get support?
1050 +
1051 +   Glad you asked. TuxOnIce is being actively maintained and supported
1052 +   by Nigel (the guy doing most of the kernel coding at the moment), Bernard
1053 +   (who maintains the hibernate script and userspace user interface components)
1054 +   and its users.
1055 +
1056 +   Resources available include HowTos, FAQs and a Wiki, all accessible via
1057 +   tuxonice.net.  You can find the mailing lists there.
1058 +
1059 +8. I think I've found a bug. What should I do?
1060 +
1061 +   By far and away, the most common problems people have with TuxOnIce
1062 +   relate to drivers not having adequate power management support. In this
1063 +   case, it is not a bug with TuxOnIce, but we can still help you. As we
1064 +   mentioned above, such issues can usually be worked around by building the
1065 +   functionality as modules and unloading them while hibernating. Please visit
1066 +   the Wiki for up-to-date lists of known issues and workarounds.
1067 +
1068 +   If this information doesn't help, try running:
1069 +
1070 +   hibernate --bug-report
1071 +
1072 +   ..and sending the output to the users mailing list.
1073 +
1074 +   Good information on how to provide us with useful information from an
1075 +   oops is found in the file REPORTING-BUGS, in the top level directory
1076 +   of the kernel tree. If you get an oops, please especially note the
1077 +   information about running what is printed on the screen through ksymoops.
1078 +   The raw information is useless.
1079 +
1080 +9. When will XXX be supported?
1081 +
1082 +   If there's a feature missing from TuxOnIce that you'd like, feel free to
1083 +   ask. We try to be obliging, within reason.
1084 +
1085 +   Patches are welcome. Please send to the list.
1086 +
1087 +10. How does it work?
1088 +
1089 +   TuxOnIce does its work in a number of steps.
1090 +
1091 +   a. Freezing system activity.
1092 +
1093 +   The first main stage in hibernating is to stop all other activity. This is
1094 +   achieved in stages. Processes are considered in four groups, which we will
1095 +   describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
1096 +   flag, kernel threads without this flag, userspace processes with the
1097 +   PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
1098 +   untouched by the refrigerator code. They are allowed to run during hibernating
1099 +   and resuming, and are used to support user interaction, storage access or the
1100 +   like. Other kernel threads (those unneeded while hibernating) are frozen last.
1101 +   This leaves us with userspace processes that need to be frozen. When a
1102 +   process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
1103 +   that process for the duration of that call. Processes that have this flag are
1104 +   frozen after processes without it, so that we can seek to ensure that dirty
1105 +   data is synced to disk as quickly as possible in a situation where other
1106 +   processes may be submitting writes at the same time. Freezing the processes
1107 +   that are submitting data stops new I/O from being submitted. Syncthreads can
1108 +   then cleanly finish their work. So the order is:
1109 +
1110 +   - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
1111 +   - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
1112 +   - Kernel processes without PF_NOFREEZE.
1113 +
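   In outline, the decision looks something like this (a simplified
   sketch, not the actual refrigerator code; should_freeze_in_pass() is
   a hypothetical helper):

        static int should_freeze_in_pass(struct task_struct *p, int pass)
        {
                if (p->flags & PF_NOFREEZE)
                        return 0;               /* never frozen */
                if (!p->mm)                     /* kernel thread */
                        return pass == 3;
                if (p->flags & PF_SYNCTHREAD)   /* inside a *_sync call */
                        return pass == 2;
                return pass == 1;               /* other userspace */
        }
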
1114 +   b. Eating memory.
1115 +
1116 +   For a successful hibernation cycle, you need enough disk space to store
1117 +   the image and enough memory for the various constraints of TuxOnIce's
1118 +   algorithm. You can also specify a maximum image size. In order to meet
1119 +   those constraints, TuxOnIce may 'eat' memory. If, after freezing
1120 +   processes, the constraints aren't met, TuxOnIce will thaw all the
1121 +   other processes and begin to eat memory until its calculations indicate
1122 +   the constraints are met. It will then freeze processes again and recheck
1123 +   its calculations.
1124 +
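   The loop is roughly as follows (a sketch; the helper names are
   illustrative rather than TuxOnIce's real functions):

        for (;;) {
                freeze_processes();
                if (constraints_met())  /* enough memory and storage? */
                        break;
                thaw_processes();
                eat_memory();           /* free pages, then try again */
        }
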
1125 +   c. Allocation of storage.
1126 +
1127 +   Next, TuxOnIce allocates the storage that will be used to save
1128 +   the image.
1129 +
1130 +   The core of TuxOnIce knows nothing about how or where pages are stored. We
1131 +   therefore request the active allocator (remember you might have compiled in
1132 +   more than one!) to allocate enough storage for our expected image size. If
1133 +   this request cannot be fulfilled, we eat more memory and try again. If it
1134 +   is fulfilled, we seek to allocate additional storage, just in case our
1135 +   expected compression ratio (if any) isn't achieved. This time, however, we
1136 +   just continue if we can't allocate enough storage.
1137 +
1138 +   If these calls to our allocator change the characteristics of the image
1139 +   such that we haven't allocated enough memory, we also loop. (The allocator
1140 +   may well need to allocate space for its storage information).
1141 +
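   Schematically (again a sketch, with made-up helper names):

        /* Keep eating memory until the allocator can hold the image. */
        while (allocator_allocate(expected_image_size()) < 0)
                eat_memory();

        /* Best effort only: extra room in case compression disappoints. */
        allocator_allocate(compression_slack());
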
1142 +   d. Write the first part of the image.
1143 +
1144 +   TuxOnIce stores the image in two sets of pages called 'pagesets'.
1145 +   Pageset 2 contains pages on the active and inactive lists; essentially
1146 +   the page cache. Pageset 1 contains all other pages, including the kernel.
1147 +   We use two pagesets for one important reason: We need to make an atomic copy
1148 +   of the kernel to ensure consistency of the image. Without a second pageset,
1149 +   that would limit us to an image that was at most half the amount of memory
1150 +   available. Using two pagesets allows us to store a full image. Since pageset
1151 +   2 pages won't be needed in saving pageset 1, we first save pageset 2 pages.
1152 +   We can then make our atomic copy of the remaining pages using both pageset 2
1153 +   pages and any other pages that are free. While saving both pagesets, we are
1154 +   careful not to corrupt the image. Among other things, we use lowlevel block
1155 +   I/O routines that don't change the pagecache contents.
1156 +
1157 +   The next step, then, is writing pageset 2.
1158 +
1159 +   e. Suspending drivers and storing processor context.
1160 +
1161 +   Having written pageset 2, TuxOnIce calls the power management functions to
1162 +   notify drivers of the hibernation, and saves the processor state in preparation
1163 +   for the atomic copy of memory we are about to make.
1164 +
1165 +   f. Atomic copy.
1166 +
1167 +   At this stage, everything else but the TuxOnIce code is halted. Processes
1168 +   are frozen or idling, drivers are quiesced and have stored (ideally and where
1169 +   necessary) their configuration in memory we are about to atomically copy.
1170 +   In our lowlevel architecture specific code, we have saved the CPU state.
1171 +   We can therefore now do our atomic copy before resuming drivers etc.
1172 +
1173 +   g. Save the atomic copy (pageset 1).
1174 +
1175 +   TuxOnIce can then write the atomic copy of the remaining pages. Since we
1176 +   have copied the pages into other locations, we can continue to use the
1177 +   normal block I/O routines without fear of corrupting our image.
1178 +
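   Putting steps d to g together, the write-out order is roughly the
   following (a sketch with illustrative names):

        save_pages(pageset2);           /* page cache; not needed below  */
        suspend_drivers();
        save_cpu_state();
        atomic_copy(pageset1);          /* into pageset 2 and free pages */
        resume_drivers();
        save_pages(pageset1_copy);      /* normal block I/O is safe now  */
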
1179 +   h. Save the image header.
1180 +
1181 +   Nearly there! We save our settings and other parameters needed for
1182 +   reloading pageset 1 in an 'image header'. We also tell our allocator to
1183 +   serialise its data at this stage, so that it can reread the image at resume
1184 +   time.
1185 +
1186 +   i. Set the image header.
1187 +
1188 +   Finally, we edit the header at our resume= location. The signature is
1189 +   changed by the allocator to reflect the fact that an image exists, and to
1190 +   point to the start of that data if necessary (swap allocator).
1191 +
1192 +   j. Power down.
1193 +
1194 +   Or reboot if we're debugging and the appropriate option is selected.
1195 +
1196 +   Whew!
1197 +
1198 +   Reloading the image.
1199 +   --------------------
1200 +
1201 +   Reloading the image is essentially the reverse of all the above. We load
1202 +   our copy of pageset 1, being careful to choose locations that aren't going
1203 +   to be overwritten as we copy it back (We start very early in the boot
1204 +   process, so there are no other processes to quiesce here). We then copy
1205 +   pageset 1 back to its original location in memory and restore the process
1206 +   context. We are now running with the original kernel. Next, we reload the
1207 +   pageset 2 pages, free the memory and swap used by TuxOnIce, restore
1208 +   the pageset header and restart processes. Sounds easy in comparison to
1209 +   hibernating, doesn't it!
1210 +
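   As a sketch (hypothetical names, mirroring the write-out order):

        load_pages(pageset1_copy);      /* pages that won't be overwritten */
        copy_back(pageset1);
        restore_cpu_state();            /* original kernel runs from here  */
        load_pages(pageset2);
        free_image_storage();
        thaw_processes();
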
1211 +   There is of course more to TuxOnIce than this, but this explanation
1212 +   should be a good start. If there's interest, I'll write further
1213 +   documentation on range pages and the low level I/O.
1214 +
1215 +11. Who wrote TuxOnIce?
1216 +
1217 +   (Answer based on the writings of Florent Chabaud, credits in files and
1218 +   Nigel's limited knowledge; apologies to anyone missed out!)
1219 +
1220 +   The main developers of TuxOnIce have been...
1221 +
1222 +   Gabor Kuti
1223 +   Pavel Machek
1224 +   Florent Chabaud
1225 +   Bernard Blackham
1226 +   Nigel Cunningham
1227 +
1228 +   Significant portions of swsusp, the code in the vanilla kernel which
1229 +   TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should
1230 +   also be expressed to him.
1231 +
1232 +   The above mentioned developers have been aided in their efforts by a host
1233 +   of hundreds, if not thousands of testers and people who have submitted bug
1234 +   fixes & suggestions. Of special note are the efforts of Michael Frank, who
1235 +   had his computers repeatedly hibernate and resume for literally tens of
1236 +   thousands of cycles and developed scripts to stress the system and test
1237 +   TuxOnIce far beyond the point most of us (Nigel included!) would consider
1238 +   testing. His efforts have contributed as much to TuxOnIce as any of the
1239 +   names above.
1240 diff --git a/MAINTAINERS b/MAINTAINERS
1241 index e467758..1921b3a 100644
1242 --- a/MAINTAINERS
1243 +++ b/MAINTAINERS
1244 @@ -3935,6 +3935,13 @@ P:       Maciej W. Rozycki
1245  M:     macro@linux-mips.org
1246  S:     Maintained
1247  
1248 +TUXONICE (ENHANCED HIBERNATION)
1249 +P:     Nigel Cunningham
1250 +M:     nigel@tuxonice.net
1251 +L:     suspend2-devel@tuxonice.net
1252 +W:     http://tuxonice.net
1253 +S:     Maintained
1254 +
1255  U14-34F SCSI DRIVER
1256  P:     Dario Ballabio
1257  M:     ballabio_dario@emc.com
1258 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
1259 index ec08d83..c417de4 100644
1260 --- a/arch/x86/mm/fault.c
1261 +++ b/arch/x86/mm/fault.c
1262 @@ -25,6 +25,7 @@
1263  #include <linux/kprobes.h>
1264  #include <linux/uaccess.h>
1265  #include <linux/kdebug.h>
1266 +#include <linux/suspend.h>
1267  
1268  #include <asm/system.h>
1269  #include <asm/desc.h>
1270 @@ -49,6 +50,11 @@
1271  #define PF_RSVD                (1<<3)
1272  #define PF_INSTR       (1<<4)
1273  
1274 +#ifdef CONFIG_X86_32
1275 +int toi_faulted;
1276 +EXPORT_SYMBOL_GPL(toi_faulted);
1277 +#endif
1278 +
1279  static inline int notify_page_fault(struct pt_regs *regs)
1280  {
1281  #ifdef CONFIG_KPROBES
1282 @@ -599,6 +605,22 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
1283  
1284         si_code = SEGV_MAPERR;
1285  
1286 +       /* During a TuxOnIce atomic copy, with DEBUG_PAGEALLOC, we will
1287 +        * get page faults where slab has been unmapped. Map them
1288 +        * temporarily and set the variable that tells TuxOnIce to
1289 +        * unmap afterwards.
1290 +        */
1291 +
1292 +#ifdef CONFIG_DEBUG_PAGEALLOC /* X86_32 only */
1293 +       if (unlikely(toi_running && !toi_faulted)) {
1294 +               struct page *page = NULL;
1295 +               toi_faulted = 1;
1296 +               page = virt_to_page(address);
1297 +               kernel_map_pages(page, 1, 1);
1298 +               return;
1299 +       }
1300 +#endif
1301 +
1302         if (notify_page_fault(regs))
1303                 return;
1304  
1305 diff --git a/crypto/Kconfig b/crypto/Kconfig
1306 index 69f1be6..21c95e0 100644
1307 --- a/crypto/Kconfig
1308 +++ b/crypto/Kconfig
1309 @@ -529,6 +529,14 @@ config CRYPTO_DEFLATE
1310           
1311           You will most probably want this if using IPSec.
1312  
1313 +config CRYPTO_LZF
1314 +       tristate "LZF compression algorithm"
1315 +       default y
1316 +       select CRYPTO_ALGAPI
1317 +       help
1318 +         This is the LZF algorithm. It is especially useful for TuxOnIce,
1319 +         because it achieves good compression quickly.
1320 +
1321  config CRYPTO_MICHAEL_MIC
1322         tristate "Michael MIC keyed digest algorithm"
1323         select CRYPTO_ALGAPI
1324 diff --git a/crypto/Makefile b/crypto/Makefile
1325 index 7cf3625..af17245 100644
1326 --- a/crypto/Makefile
1327 +++ b/crypto/Makefile
1328 @@ -60,6 +60,7 @@ obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
1329  obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
1330  obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
1331  obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
1332 +obj-$(CONFIG_CRYPTO_LZF) += lzf.o
1333  obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o
1334  obj-$(CONFIG_CRYPTO_LZO) += lzo.o
1335  
1336 diff --git a/crypto/lzf.c b/crypto/lzf.c
1337 new file mode 100644
1338 index 0000000..3e0aa8c
1339 --- /dev/null
1340 +++ b/crypto/lzf.c
1341 @@ -0,0 +1,326 @@
1342 +/*
1343 + * Cryptoapi LZF compression module.
1344 + *
1345 + * Copyright (c) 2004-2005 Nigel Cunningham <nigel at tuxonice net>
1346 + *
1347 + * based on the deflate.c file:
1348 + *
1349 + * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
1350 + *
1351 + * and upon the LZF compression module donated to the TuxOnIce project with
1352 + * the following copyright:
1353 + *
1354 + * This program is free software; you can redistribute it and/or modify it
1355 + * under the terms of the GNU General Public License as published by the Free
1356 + * Software Foundation; either version 2 of the License, or (at your option)
1357 + * any later version.
1358 + * Copyright (c) 2000-2003 Marc Alexander Lehmann <pcg@goof.com>
1359 + *
1360 + * Redistribution and use in source and binary forms, with or without modifica-
1361 + * tion, are permitted provided that the following conditions are met:
1362 + *
1363 + *   1.  Redistributions of source code must retain the above copyright notice,
1364 + *       this list of conditions and the following disclaimer.
1365 + *
1366 + *   2.  Redistributions in binary form must reproduce the above copyright
1367 + *       notice, this list of conditions and the following disclaimer in the
1368 + *       documentation and/or other materials provided with the distribution.
1369 + *
1370 + *   3.  The name of the author may not be used to endorse or promote products
1371 + *       derived from this software without specific prior written permission.
1372 + *
1373 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
1374 + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
1375 + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
1376 + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
1377 + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
1378 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
1379 + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
1380 + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
1381 + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
1382 + * OF THE POSSIBILITY OF SUCH DAMAGE.
1383 + *
1384 + * Alternatively, the contents of this file may be used under the terms of
1385 + * the GNU General Public License version 2 (the "GPL"), in which case the
1386 + * provisions of the GPL are applicable instead of the above. If you wish to
1387 + * allow the use of your version of this file only under the terms of the
1388 + * GPL and not to allow others to use your version of this file under the
1389 + * BSD license, indicate your decision by deleting the provisions above and
1390 + * replace them with the notice and other provisions required by the GPL. If
1391 + * you do not delete the provisions above, a recipient may use your version
1392 + * of this file under either the BSD or the GPL.
1393 + */
1394 +
1395 +#include <linux/kernel.h>
1396 +#include <linux/module.h>
1397 +#include <linux/init.h>
1398 +#include <linux/module.h>
1399 +#include <linux/crypto.h>
1400 +#include <linux/err.h>
1401 +#include <linux/vmalloc.h>
1402 +#include <asm/string.h>
1403 +
1404 +struct lzf_ctx {
1405 +       void *hbuf;
1406 +       unsigned int bufofs;
1407 +};
1408 +
1409 +/*
1410 + * size of hashtable is (1 << hlog) * sizeof (char *)
1411 + * decompression is independent of the hash table size
1412 + * the difference between 15 and 14 is very small
1413 + * for small blocks (and 14 is also faster).
1414 + * For a low-memory configuration, use hlog == 13;
1415 + * For best compression, use 15 or 16.
1416 + */
1417 +static const int hlog = 13;
1418 +
1419 +/*
1420 + * don't play with this unless you benchmark!
1421 + * decompression is not dependent on the hash function
1422 + * the hashing function might seem strange, just believe me
1423 + * it works ;)
1424 + */
1425 +static inline u16 first(const u8 *p)
1426 +{
1427 +       return ((p[0]) << 8) + p[1];
1428 +}
1429 +
1430 +static inline u16 next(u8 v, const u8 *p)
1431 +{
1432 +       return ((v) << 8) + p[2];
1433 +}
1434 +
1435 +static inline u32 idx(unsigned int h)
1436 +{
1437 +       return (((h ^ (h << 5)) >> (3*8 - hlog)) + h*3) & ((1 << hlog) - 1);
1438 +}
1439 +
1440 +/*
1441 + * IDX works because it is very similar to a multiplicative hash, e.g.
1442 + * (h * 57321 >> (3*8 - hlog))
1443 + * the next one is also quite good, albeit slow ;)
1444 + * (int)(cos(h & 0xffffff) * 1e6)
1445 + */
1446 +
1447 +static const int max_lit = (1 <<  5);
1448 +static const int max_off = (1 << 13);
1449 +static const int max_ref = ((1 <<  8) + (1 << 3));
1450 +
1451 +/*
1452 + * compressed format
1453 + *
1454 + * 000LLLLL <L+1>    ; literal
1455 + * LLLOOOOO oooooooo ; backref L
1456 + * 111OOOOO LLLLLLLL oooooooo ; backref L+7
1457 + *
1458 + */
1459 +
1460 +static void lzf_compress_exit(struct crypto_tfm *tfm)
1461 +{
1462 +       struct lzf_ctx *ctx = crypto_tfm_ctx(tfm);
1463 +
1464 +       if (!ctx->hbuf)
1465 +               return;
1466 +
1467 +       vfree(ctx->hbuf);
1468 +       ctx->hbuf = NULL;
1469 +}
1470 +
1471 +static int lzf_compress_init(struct crypto_tfm *tfm)
1472 +{
1473 +       struct lzf_ctx *ctx = crypto_tfm_ctx(tfm);
1474 +
1475 +       /* Get LZF ready to go */
1476 +       ctx->hbuf = vmalloc_32((1 << hlog) * sizeof(char *));
1477 +       if (ctx->hbuf)
1478 +               return 0;
1479 +
1480 +       printk(KERN_WARNING "Failed to allocate %ld bytes for lzf workspace\n",
1481 +                       (long) ((1 << hlog) * sizeof(char *)));
1482 +       return -ENOMEM;
1483 +}
1484 +
1485 +static int lzf_compress(struct crypto_tfm *tfm, const u8 *in_data,
1486 +               unsigned int in_len, u8 *out_data, unsigned int *out_len)
1487 +{
1488 +       struct lzf_ctx *ctx = crypto_tfm_ctx(tfm);
1489 +       const u8 **htab = ctx->hbuf;
1490 +       const u8 **hslot;
1491 +       const u8 *ip = in_data;
1492 +       u8 *op = out_data;
1493 +       const u8 *in_end = ip + in_len;
1494 +       u8 *out_end = op + *out_len - 3;
1495 +       const u8 *ref;
1496 +
1497 +       unsigned int hval = first(ip);
1498 +       unsigned long off;
1499 +       int lit = 0;
1500 +
1501 +       memset(htab, 0, (1 << hlog) * sizeof(char *)); /* whole table */
1502 +
1503 +       for (;;) {
1504 +               if (ip < in_end - 2) {
1505 +                       hval = next(hval, ip);
1506 +                       hslot = htab + idx(hval);
1507 +                       ref = *hslot;
1508 +                       *hslot = ip;
1509 +
1510 +                       off = ip - ref - 1;
1511 +                       if (off < max_off
1512 +                           && ip + 4 < in_end && ref > in_data
1513 +                           && *(u16 *) ref == *(u16 *) ip && ref[2] == ip[2]
1514 +                           ) {
1515 +                               /* match found at *ref++ */
1516 +                               unsigned int len = 2;
1517 +                               unsigned int maxlen = in_end - ip - len;
1518 +                               maxlen = maxlen > max_ref ? max_ref : maxlen;
1519 +
1520 +                               do
1521 +                                       len++;
1522 +                               while (len < maxlen && ref[len] == ip[len]);
1523 +
1524 +                               if (op + lit + 1 + 3 >= out_end) {
1525 +                                       *out_len = PAGE_SIZE;
1526 +                                       return 0;
1527 +                               }
1528 +
1529 +                               if (lit) {
1530 +                                       *op++ = lit - 1;
1531 +                                       lit = -lit;
1532 +                                       do {
1533 +                                               *op++ = ip[lit];
1534 +                                       } while (++lit);
1535 +                               }
1536 +
1537 +                               len -= 2;
1538 +                               ip++;
1539 +
1540 +                               if (len < 7) {
1541 +                                       *op++ = (off >> 8) + (len << 5);
1542 +                               } else {
1543 +                                       *op++ = (off >> 8) + (7 << 5);
1544 +                                       *op++ = len - 7;
1545 +                               }
1546 +
1547 +                               *op++ = off;
1548 +
1549 +                               ip += len;
1550 +                               hval = first(ip);
1551 +                               hval = next(hval, ip);
1552 +                               htab[idx(hval)] = ip;
1553 +                               ip++;
1554 +                               continue;
1555 +                       }
1556 +               } else if (ip == in_end)
1557 +                       break;
1558 +
1559 +               /* one more literal byte we must copy */
1560 +               lit++;
1561 +               ip++;
1562 +
1563 +               if (lit == max_lit) {
1564 +                       if (op + 1 + max_lit >= out_end) {
1565 +                               *out_len = PAGE_SIZE;
1566 +                               return 0;
1567 +                       }
1568 +
1569 +                       *op++ = max_lit - 1;
1570 +                       memcpy(op, ip - max_lit, max_lit);
1571 +                       op += max_lit;
1572 +                       lit = 0;
1573 +               }
1574 +       }
1575 +
1576 +       if (lit) {
1577 +               if (op + lit + 1 >= out_end) {
1578 +                       *out_len = PAGE_SIZE;
1579 +                       return 0;
1580 +               }
1581 +
1582 +               *op++ = lit - 1;
1583 +               lit = -lit;
1584 +               do {
1585 +                       *op++ = ip[lit];
1586 +               } while (++lit);
1587 +       }
1588 +
1589 +       *out_len = op - out_data;
1590 +       return 0;
1591 +}
1592 +
1593 +static int lzf_decompress(struct crypto_tfm *tfm, const u8 *src,
1594 +               unsigned int slen, u8 *dst, unsigned int *dlen)
1595 +{
1596 +       u8 const *ip = src;
1597 +       u8 *op = dst;
1598 +       u8 const *const in_end = ip + slen;
1599 +       u8 *const out_end = op + *dlen;
1600 +
1601 +       *dlen = PAGE_SIZE;
1602 +       do {
1603 +               unsigned int ctrl = *ip++;
1604 +
1605 +               if (ctrl < (1 << 5)) {
1606 +                       /* literal run */
1607 +                       ctrl++;
1608 +
1609 +                       if (op + ctrl > out_end)
1610 +                               return 0;
1611 +                       memcpy(op, ip, ctrl);
1612 +                       op += ctrl;
1613 +                       ip += ctrl;
1614 +               } else {        /* back reference */
1615 +
1616 +                       unsigned int len = ctrl >> 5;
1617 +
1618 +                       u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
1619 +
1620 +                       if (len == 7)
1621 +                               len += *ip++;
1622 +
1623 +                       ref -= *ip++;
1624 +                       len += 2;
1625 +
1626 +                       if (op + len > out_end || ref < (u8 *) dst)
1627 +                               return 0;
1628 +
1629 +                       do {
1630 +                               *op++ = *ref++;
1631 +                       } while (--len);
1632 +               }
1633 +       } while (op < out_end && ip < in_end);
1634 +
1635 +       *dlen = op - (u8 *) dst;
1636 +       return 0;
1637 +}
1638 +
1639 +static struct crypto_alg alg = {
1640 +       .cra_name = "lzf",
1641 +       .cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
1642 +       .cra_ctxsize = sizeof(struct lzf_ctx),
1643 +       .cra_module = THIS_MODULE,
1644 +       .cra_list = LIST_HEAD_INIT(alg.cra_list),
1645 +       .cra_init = lzf_compress_init,
1646 +       .cra_exit = lzf_compress_exit,
1647 +       .cra_u = { .compress = {
1648 +       .coa_compress = lzf_compress,
1649 +       .coa_decompress = lzf_decompress } }
1650 +};
1651 +
1652 +static int __init init(void)
1653 +{
1654 +       return crypto_register_alg(&alg);
1655 +}
1656 +
1657 +static void __exit fini(void)
1658 +{
1659 +       crypto_unregister_alg(&alg);
1660 +}
1661 +
1662 +module_init(init);
1663 +module_exit(fini);
1664 +
1665 +MODULE_LICENSE("GPL");
1666 +MODULE_DESCRIPTION("LZF Compression Algorithm");
1667 +MODULE_AUTHOR("Marc Alexander Lehmann & Nigel Cunningham");
1668 diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
1669 index d6365a9..03e1eff 100644
1670 --- a/drivers/macintosh/via-pmu.c
1671 +++ b/drivers/macintosh/via-pmu.c
1672 @@ -39,7 +39,6 @@
1673  #include <linux/interrupt.h>
1674  #include <linux/device.h>
1675  #include <linux/sysdev.h>
1676 -#include <linux/freezer.h>
1677  #include <linux/syscalls.h>
1678  #include <linux/suspend.h>
1679  #include <linux/cpu.h>
1680 diff --git a/drivers/md/md.c b/drivers/md/md.c
1681 index 61ccbd2..5083aae 100644
1682 --- a/drivers/md/md.c
1683 +++ b/drivers/md/md.c
1684 @@ -5623,6 +5623,8 @@ void md_do_sync(mddev_t *mddev)
1685                         last_mark = next;
1686                 }
1687  
1688 +               while (freezer_is_on())
1689 +                       yield();
1690  
1691                 if (kthread_should_stop())
1692                         goto interrupted;
1693 diff --git a/fs/buffer.c b/fs/buffer.c
1694 index 39ff144..915fc52 100644
1695 --- a/fs/buffer.c
1696 +++ b/fs/buffer.c
1697 @@ -247,6 +247,93 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb)
1698  }
1699  EXPORT_SYMBOL(thaw_bdev);
1700  
1701 +/* #define DEBUG_FS_FREEZING */
1702 +
1703 +/**
1704 + * freeze_filesystems - lock all filesystems and force them into a consistent
1705 + * state
1706 + */
1707 +void freeze_filesystems(int which)
1708 +{
1709 +       struct super_block *sb;
1710 +
1711 +       lockdep_off();
1712 +
1713 +       /*
1714 +        * Freeze in reverse order so filesystems dependent upon others are
1715 +        * frozen in the right order (eg. loopback on ext3).
1716 +        */
1717 +       list_for_each_entry_reverse(sb, &super_blocks, s_list) {
1718 +#ifdef DEBUG_FS_FREEZING
1719 +               printk(KERN_INFO "Considering %s.%s: (root %p, bdev %x)",
1720 +                       sb->s_type->name ? sb->s_type->name : "?",
1721 +                       sb->s_subtype ? sb->s_subtype : "", sb->s_root,
1722 +                       sb->s_bdev ? sb->s_bdev->bd_dev : 0);
1723 +#endif
1724 +
1725 +               if (sb->s_type->fs_flags & FS_IS_FUSE &&
1726 +                   sb->s_frozen == SB_UNFROZEN &&
1727 +                   which & FS_FREEZER_FUSE) {
1728 +                       sb->s_frozen = SB_FREEZE_TRANS;
1729 +                       sb->s_flags |= MS_FROZEN;
1730 +                       printk("Fuse filesystem done.\n");
1731 +                       continue;
1732 +               }
1733 +
1734 +               if (!sb->s_root || !sb->s_bdev ||
1735 +                   (sb->s_frozen == SB_FREEZE_TRANS) ||
1736 +                   (sb->s_flags & MS_RDONLY) ||
1737 +                   (sb->s_flags & MS_FROZEN) ||
1738 +                   !(which & FS_FREEZER_NORMAL)) {
1739 +#ifdef DEBUG_FS_FREEZING
1740 +                       printk(KERN_INFO "Nope.\n");
1741 +#endif
1742 +                       continue;
1743 +               }
1744 +
1745 +#ifdef DEBUG_FS_FREEZING
1746 +               printk(KERN_INFO "Freezing %x... ", sb->s_bdev->bd_dev);
1747 +#endif
1748 +               freeze_bdev(sb->s_bdev);
1749 +               sb->s_flags |= MS_FROZEN;
1750 +#ifdef DEBUG_FS_FREEZING
1751 +               printk(KERN_INFO "Done.\n");
1752 +#endif
1753 +       }
1754 +
1755 +       lockdep_on();
1756 +}
1757 +
1758 +/**
1759 + * thaw_filesystems - unlock all filesystems
1760 + */
1761 +void thaw_filesystems(int which)
1762 +{
1763 +       struct super_block *sb;
1764 +
1765 +       lockdep_off();
1766 +
1767 +       list_for_each_entry(sb, &super_blocks, s_list) {
1768 +               if (!(sb->s_flags & MS_FROZEN))
1769 +                       continue;
1770 +
1771 +               if (sb->s_type->fs_flags & FS_IS_FUSE) {
1772 +                       if (!(which & FS_FREEZER_FUSE))
1773 +                               continue;
1774 +
1775 +                       sb->s_frozen = SB_UNFROZEN;
1776 +               } else {
1777 +                       if (!(which & FS_FREEZER_NORMAL))
1778 +                               continue;
1779 +
1780 +                       thaw_bdev(sb->s_bdev, sb);
1781 +               }
1782 +               sb->s_flags &= ~MS_FROZEN;
1783 +       }
1784 +
1785 +       lockdep_on();
1786 +}
1787 +
1788  /*
1789   * Various filesystems appear to want __find_get_block to be non-blocking.
1790   * But it's the page lock which protects the buffers.  To get around this,
1791 diff --git a/fs/fuse/control.c b/fs/fuse/control.c
1792 index 105d4a2..57eeca4 100644
1793 --- a/fs/fuse/control.c
1794 +++ b/fs/fuse/control.c
1795 @@ -207,6 +207,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
1796  static struct file_system_type fuse_ctl_fs_type = {
1797         .owner          = THIS_MODULE,
1798         .name           = "fusectl",
1799 +       .fs_flags       = FS_IS_FUSE,
1800         .get_sb         = fuse_ctl_get_sb,
1801         .kill_sb        = fuse_ctl_kill_sb,
1802  };
1803 diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
1804 index af63980..d417551 100644
1805 --- a/fs/fuse/dev.c
1806 +++ b/fs/fuse/dev.c
1807 @@ -7,6 +7,7 @@
1808  */
1809  
1810  #include "fuse_i.h"
1811 +#include "fuse.h"
1812  
1813  #include <linux/init.h>
1814  #include <linux/module.h>
1815 @@ -16,6 +17,7 @@
1816  #include <linux/pagemap.h>
1817  #include <linux/file.h>
1818  #include <linux/slab.h>
1819 +#include <linux/freezer.h>
1820  
1821  MODULE_ALIAS_MISCDEV(FUSE_MINOR);
1822  
1823 @@ -723,6 +725,8 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1824         if (!fc)
1825                 return -EPERM;
1826  
1827 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_dev_read");
1828 +
1829   restart:
1830         spin_lock(&fc->lock);
1831         err = -EAGAIN;
1832 @@ -849,6 +853,9 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1833         if (!fc)
1834                 return -EPERM;
1835  
1836 +       FUSE_MIGHT_FREEZE(iocb->ki_filp->f_mapping->host->i_sb,
1837 +                       "fuse_dev_write");
1838 +
1839         fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1840         if (nbytes < sizeof(struct fuse_out_header))
1841                 return -EINVAL;
1842 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
1843 index c4807b3..d8b4526 100644
1844 --- a/fs/fuse/dir.c
1845 +++ b/fs/fuse/dir.c
1846 @@ -7,12 +7,14 @@
1847  */
1848  
1849  #include "fuse_i.h"
1850 +#include "fuse.h"
1851  
1852  #include <linux/pagemap.h>
1853  #include <linux/file.h>
1854  #include <linux/gfp.h>
1855  #include <linux/sched.h>
1856  #include <linux/namei.h>
1857 +#include <linux/freezer.h>
1858  
1859  #if BITS_PER_LONG >= 64
1860  static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
1861 @@ -176,6 +178,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
1862                         return 0;
1863  
1864                 fc = get_fuse_conn(inode);
1865 +
1866 +               FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_dentry_revalidate");
1867 +
1868                 req = fuse_get_req(fc);
1869                 if (IS_ERR(req))
1870                         return 0;
1871 @@ -271,6 +276,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
1872         if (IS_ERR(req))
1873                 return ERR_CAST(req);
1874  
1875 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_lookup");
1876 +
1877         forget_req = fuse_get_req(fc);
1878         if (IS_ERR(forget_req)) {
1879                 fuse_put_request(fc, req);
1880 @@ -361,6 +368,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
1881         if (IS_ERR(forget_req))
1882                 return PTR_ERR(forget_req);
1883  
1884 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_create_open");
1885 +
1886         req = fuse_get_req(fc);
1887         err = PTR_ERR(req);
1888         if (IS_ERR(req))
1889 @@ -447,6 +456,8 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
1890         int err;
1891         struct fuse_req *forget_req;
1892  
1893 +       FUSE_MIGHT_FREEZE(dir->i_sb, "create_new_entry");
1894 +
1895         forget_req = fuse_get_req(fc);
1896         if (IS_ERR(forget_req)) {
1897                 fuse_put_request(fc, req);
1898 @@ -544,7 +555,11 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
1899  {
1900         struct fuse_mkdir_in inarg;
1901         struct fuse_conn *fc = get_fuse_conn(dir);
1902 -       struct fuse_req *req = fuse_get_req(fc);
1903 +       struct fuse_req *req;
1904 +
1905 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_mkdir");
1906 +
1907 +       req = fuse_get_req(fc);
1908         if (IS_ERR(req))
1909                 return PTR_ERR(req);
1910  
1911 @@ -564,7 +579,11 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
1912  {
1913         struct fuse_conn *fc = get_fuse_conn(dir);
1914         unsigned len = strlen(link) + 1;
1915 -       struct fuse_req *req = fuse_get_req(fc);
1916 +       struct fuse_req *req;
1917 +
1918 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_symlink");
1919 +
1920 +       req = fuse_get_req(fc);
1921         if (IS_ERR(req))
1922                 return PTR_ERR(req);
1923  
1924 @@ -581,7 +600,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
1925  {
1926         int err;
1927         struct fuse_conn *fc = get_fuse_conn(dir);
1928 -       struct fuse_req *req = fuse_get_req(fc);
1929 +       struct fuse_req *req;
1930 +
1931 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_unlink");
1932 +
1933 +       req = fuse_get_req(fc);
1934         if (IS_ERR(req))
1935                 return PTR_ERR(req);
1936  
1937 @@ -612,7 +635,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
1938  {
1939         int err;
1940         struct fuse_conn *fc = get_fuse_conn(dir);
1941 -       struct fuse_req *req = fuse_get_req(fc);
1942 +       struct fuse_req *req;
1943 +
1944 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_rmdir");
1945 +
1946 +       req = fuse_get_req(fc);
1947         if (IS_ERR(req))
1948                 return PTR_ERR(req);
1949  
1950 diff --git a/fs/fuse/file.c b/fs/fuse/file.c
1951 index 676b0bc..56bb4eb 100644
1952 --- a/fs/fuse/file.c
1953 +++ b/fs/fuse/file.c
1954 @@ -7,11 +7,13 @@
1955  */
1956  
1957  #include "fuse_i.h"
1958 +#include "fuse.h"
1959  
1960  #include <linux/pagemap.h>
1961  #include <linux/slab.h>
1962  #include <linux/kernel.h>
1963  #include <linux/sched.h>
1964 +#include <linux/freezer.h>
1965  
1966  static const struct file_operations fuse_direct_io_file_operations;
1967  
1968 @@ -23,6 +25,8 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
1969         struct fuse_req *req;
1970         int err;
1971  
1972 +       FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_send_open");
1973 +
1974         req = fuse_get_req(fc);
1975         if (IS_ERR(req))
1976                 return PTR_ERR(req);
1977 @@ -546,6 +550,8 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
1978         if (is_bad_inode(inode))
1979                 return -EIO;
1980  
1981 +       FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_commit_write");
1982 +
1983         req = fuse_get_req(fc);
1984         if (IS_ERR(req))
1985                 return PTR_ERR(req);
1986 @@ -639,6 +645,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1987         if (is_bad_inode(inode))
1988                 return -EIO;
1989  
1990 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_direct_io");
1991 +
1992         req = fuse_get_req(fc);
1993         if (IS_ERR(req))
1994                 return PTR_ERR(req);
1995 @@ -791,6 +799,8 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
1996         struct fuse_lk_out outarg;
1997         int err;
1998  
1999 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_getlk");
2000 +
2001         req = fuse_get_req(fc);
2002         if (IS_ERR(req))
2003                 return PTR_ERR(req);
2004 @@ -821,6 +831,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2005         if (fl->fl_flags & FL_CLOSE)
2006                 return 0;
2007  
2008 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_setlk");
2009 +
2010         req = fuse_get_req(fc);
2011         if (IS_ERR(req))
2012                 return PTR_ERR(req);
2013 @@ -885,6 +897,8 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2014         if (!inode->i_sb->s_bdev || fc->no_bmap)
2015                 return 0;
2016  
2017 +       FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_bmap");
2018 +
2019         req = fuse_get_req(fc);
2020         if (IS_ERR(req))
2021                 return 0;
2022 diff --git a/fs/fuse/fuse.h b/fs/fuse/fuse.h
2023 new file mode 100644
2024 index 0000000..170e49a
2025 --- /dev/null
2026 +++ b/fs/fuse/fuse.h
2027 @@ -0,0 +1,13 @@
2028 +#define FUSE_MIGHT_FREEZE(superblock, desc) \
2029 +do { \
2030 +       int printed = 0; \
2031 +       while (superblock->s_frozen != SB_UNFROZEN) { \
2032 +               if (!printed) { \
2033 +                       printk(KERN_INFO "%d frozen in " desc ".\n", \
2034 +                                               current->pid); \
2035 +                       printed = 1; \
2036 +               } \
2037 +               try_to_freeze(); \
2038 +               yield(); \
2039 +       } \
2040 +} while (0)
2041 diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
2042 index 033f7bd..ac6805a 100644
2043 --- a/fs/fuse/inode.c
2044 +++ b/fs/fuse/inode.c
2045 @@ -702,7 +702,7 @@ static int fuse_get_sb(struct file_system_type *fs_type,
2046  static struct file_system_type fuse_fs_type = {
2047         .owner          = THIS_MODULE,
2048         .name           = "fuse",
2049 -       .fs_flags       = FS_HAS_SUBTYPE,
2050 +       .fs_flags       = FS_HAS_SUBTYPE | FS_IS_FUSE,
2051         .get_sb         = fuse_get_sb,
2052         .kill_sb        = kill_anon_super,
2053  };
2054 @@ -721,7 +721,7 @@ static struct file_system_type fuseblk_fs_type = {
2055         .name           = "fuseblk",
2056         .get_sb         = fuse_get_sb_blk,
2057         .kill_sb        = kill_block_super,
2058 -       .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
2059 +       .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_IS_FUSE,
2060  };
2061  
2062  static inline int register_fuseblk(void)
2063 diff --git a/fs/ioctl.c b/fs/ioctl.c
2064 index f32fbde..9014fe4 100644
2065 --- a/fs/ioctl.c
2066 +++ b/fs/ioctl.c
2067 @@ -211,3 +211,4 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
2068   out:
2069         return error;
2070  }
2071 +EXPORT_SYMBOL(sys_ioctl);
2072 diff --git a/fs/namei.c b/fs/namei.c
2073 index 8cf9bb9..c1abd13 100644
2074 --- a/fs/namei.c
2075 +++ b/fs/namei.c
2076 @@ -2177,6 +2177,8 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2077         if (!dir->i_op || !dir->i_op->unlink)
2078                 return -EPERM;
2079  
2080 +       vfs_check_frozen(dir->i_sb, SB_FREEZE_WRITE);
2081 +
2082         DQUOT_INIT(dir);
2083  
2084         mutex_lock(&dentry->d_inode->i_mutex);
2085 diff --git a/include/asm-powerpc/suspend.h b/include/asm-powerpc/suspend.h
2086 index cbf2c94..e0756c2 100644
2087 --- a/include/asm-powerpc/suspend.h
2088 +++ b/include/asm-powerpc/suspend.h
2089 @@ -6,4 +6,7 @@ static inline int arch_prepare_suspend(void) { return 0; }
2090  void save_processor_state(void);
2091  void restore_processor_state(void);
2092  
2093 +#define toi_faulted (0)
2094 +#define clear_toi_fault() do { } while (0)
2095 +
2096  #endif /* __ASM_POWERPC_SUSPEND_H */
2097 diff --git a/include/asm-ppc/suspend.h b/include/asm-ppc/suspend.h
2098 index 3df9f32..1e2e73d 100644
2099 --- a/include/asm-ppc/suspend.h
2100 +++ b/include/asm-ppc/suspend.h
2101 @@ -10,3 +10,6 @@ static inline void save_processor_state(void)
2102  static inline void restore_processor_state(void)
2103  {
2104  }
2105 +
2106 +#define toi_faulted (0)
2107 +#define clear_toi_fault() do { } while (0)
2108 diff --git a/include/asm-x86/suspend_32.h b/include/asm-x86/suspend_32.h
2109 index 1bbda3a..e58d1b5 100644
2110 --- a/include/asm-x86/suspend_32.h
2111 +++ b/include/asm-x86/suspend_32.h
2112 @@ -8,6 +8,9 @@
2113  
2114  static inline int arch_prepare_suspend(void) { return 0; }
2115  
2116 +extern int toi_faulted;
2117 +#define clear_toi_fault() do { toi_faulted = 0; } while (0)
2118 +
2119  /* image of the saved processor state */
2120  struct saved_context {
2121         u16 es, fs, gs, ss;
2122 diff --git a/include/asm-x86/suspend_64.h b/include/asm-x86/suspend_64.h
2123 index 2eb92cb..2d429e7 100644
2124 --- a/include/asm-x86/suspend_64.h
2125 +++ b/include/asm-x86/suspend_64.h
2126 @@ -15,6 +15,9 @@ arch_prepare_suspend(void)
2127         return 0;
2128  }
2129  
2130 +#define toi_faulted (0)
2131 +#define clear_toi_fault() do { } while (0)
2132 +
2133  /*
2134   * Image of the saved processor state, used by the low level ACPI suspend to
2135   * RAM code and by the low level hibernation code.
2136 diff --git a/include/linux/Kbuild b/include/linux/Kbuild
2137 index cedbbd8..7a731ba 100644
2138 --- a/include/linux/Kbuild
2139 +++ b/include/linux/Kbuild
2140 @@ -202,6 +202,7 @@ unifdef-y += filter.h
2141  unifdef-y += flat.h
2142  unifdef-y += futex.h
2143  unifdef-y += fs.h
2144 +unifdef-y += freezer.h
2145  unifdef-y += gameport.h
2146  unifdef-y += generic_serial.h
2147  unifdef-y += gfs2_ondisk.h
2148 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
2149 index 932eb02..1436765 100644
2150 --- a/include/linux/buffer_head.h
2151 +++ b/include/linux/buffer_head.h
2152 @@ -172,6 +172,11 @@ wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
2153  int fsync_bdev(struct block_device *);
2154  struct super_block *freeze_bdev(struct block_device *);
2155  void thaw_bdev(struct block_device *, struct super_block *);
2156 +#define FS_FREEZER_FUSE 1
2157 +#define FS_FREEZER_NORMAL 2
2158 +#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL)
2159 +void freeze_filesystems(int which);
2160 +void thaw_filesystems(int which);
2161  int fsync_super(struct super_block *);
2162  int fsync_no_super(struct block_device *);
2163  struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
2164 diff --git a/include/linux/dyn_pageflags.h b/include/linux/dyn_pageflags.h
2165 new file mode 100644
2166 index 0000000..e85c3ee
2167 --- /dev/null
2168 +++ b/include/linux/dyn_pageflags.h
2169 @@ -0,0 +1,66 @@
2170 +/*
2171 + * include/linux/dyn_pageflags.h
2172 + *
2173 + * Copyright (C) 2004-2007 Nigel Cunningham <nigel at tuxonice net>
2174 + *
2175 + * This file is released under the GPLv2.
2176 + *
2177 + * It implements support for dynamically allocated bitmaps that are
2178 + * used for temporary or infrequently used pageflags, in lieu of
2179 + * bits in the struct page flags entry.
2180 + */
2181 +
2182 +#ifndef DYN_PAGEFLAGS_H
2183 +#define DYN_PAGEFLAGS_H
2184 +
2185 +#include <linux/mm.h>
2186 +
2187 +struct dyn_pageflags {
2188 +       unsigned long ****bitmap; /* [pg_dat][zone][page_num] */
2189 +       int sparse, initialised;
2190 +       struct list_head list;
2191 +       spinlock_t struct_lock;
2192 +};
2193 +
2194 +#define DYN_PAGEFLAGS_INIT(name) { \
2195 +       .list = LIST_HEAD_INIT(name.list), \
2196 +       .struct_lock = __SPIN_LOCK_UNLOCKED(name.lock) \
2197 +}
2198 +
2199 +#define DECLARE_DYN_PAGEFLAGS(name) \
2200 +       struct dyn_pageflags name = DYN_PAGEFLAGS_INIT(name);
2201 +
2202 +#define BITMAP_FOR_EACH_SET(BITMAP, CTR) \
2203 +       for (CTR = get_next_bit_on(BITMAP, max_pfn + 1); CTR <= max_pfn; \
2204 +               CTR = get_next_bit_on(BITMAP, CTR))
2205 +
2206 +extern void clear_dyn_pageflags(struct dyn_pageflags *pagemap);
2207 +extern int allocate_dyn_pageflags(struct dyn_pageflags *pagemap, int sparse);
2208 +extern void free_dyn_pageflags(struct dyn_pageflags *pagemap);
2209 +extern unsigned long get_next_bit_on(struct dyn_pageflags *bitmap,
2210 +       unsigned long counter);
2211 +
2212 +extern int test_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
2213 +/*
2214 + * In sparse bitmaps, setting a flag can fail (we can fail to allocate
2215 + * the page to store the bit). If this happens, we will BUG(). If you don't
2216 + * want this behaviour, don't allocate sparse pageflags.
2217 + */
2218 +extern void set_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
2219 +extern void clear_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
2220 +extern void dump_pagemap(struct dyn_pageflags *pagemap);
2221 +
2222 +/*
2223 + * With the above macros defined, you can do...
2224 + * #define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
2225 + * #define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
2226 + * #define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
2227 + */
2228 +
2229 +extern void __init dyn_pageflags_init(void);
2230 +extern void __init dyn_pageflags_use_kzalloc(void);
2231 +
2232 +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
2233 +extern void dyn_pageflags_hotplug(struct zone *zone);
2234 +#endif
2235 +#endif
2236 diff --git a/include/linux/freezer.h b/include/linux/freezer.h
2237 index 0893499..01e9dc6 100644
2238 --- a/include/linux/freezer.h
2239 +++ b/include/linux/freezer.h
2240 @@ -127,6 +127,19 @@ static inline void set_freezable(void)
2241         current->flags &= ~PF_NOFREEZE;
2242  }
2243  
2244 +extern int freezer_state;
2245 +#define FREEZER_OFF 0
2246 +#define FREEZER_FILESYSTEMS_FROZEN 1
2247 +#define FREEZER_USERSPACE_FROZEN 2
2248 +#define FREEZER_FULLY_ON 3
2249 +
2250 +static inline int freezer_is_on(void)
2251 +{
2252 +       return (freezer_state == FREEZER_FULLY_ON);
2253 +}
2254 +
2255 +extern void thaw_kernel_threads(void);
2256 +
2257  /*
2258   * Freezer-friendly wrappers around wait_event_interruptible() and
2259   * wait_event_interruptible_timeout(), originally defined in <linux/wait.h>
2260 @@ -169,6 +182,8 @@ static inline int freeze_processes(void) { BUG(); return 0; }
2261  static inline void thaw_processes(void) {}
2262  
2263  static inline int try_to_freeze(void) { return 0; }
2264 +static inline int freezer_is_on(void) { return 0; }
2265 +static inline void thaw_kernel_threads(void) { }
2266  
2267  static inline void freezer_do_not_count(void) {}
2268  static inline void freezer_count(void) {}
2269 diff --git a/include/linux/fs.h b/include/linux/fs.h
2270 index b84b848..7c14b03 100644
2271 --- a/include/linux/fs.h
2272 +++ b/include/linux/fs.h
2273 @@ -8,6 +8,7 @@
2274  
2275  #include <linux/limits.h>
2276  #include <linux/ioctl.h>
2277 +#include <linux/freezer.h>
2278  
2279  /*
2280   * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
2281 @@ -93,6 +94,7 @@ extern int dir_notify_enable;
2282  #define FS_REQUIRES_DEV 1 
2283  #define FS_BINARY_MOUNTDATA 2
2284  #define FS_HAS_SUBTYPE 4
2285 +#define FS_IS_FUSE     8       /* Fuse filesystem - bdev freeze these too */
2286  #define FS_REVAL_DOT   16384   /* Check the paths ".", ".." for staleness */
2287  #define FS_RENAME_DOES_D_MOVE  32768   /* FS will handle d_move()
2288                                          * during rename() internally.
2289 @@ -125,6 +127,7 @@ extern int dir_notify_enable;
2290  #define MS_RELATIME    (1<<21) /* Update atime relative to mtime/ctime. */
2291  #define MS_KERNMOUNT   (1<<22) /* this is a kern_mount call */
2292  #define MS_I_VERSION   (1<<23) /* Update inode I_version field */
2293 +#define MS_FROZEN      (1<<24) /* Frozen by freeze_filesystems() */
2294  #define MS_ACTIVE      (1<<30)
2295  #define MS_NOUSER      (1<<31)
2296  
2297 @@ -1057,8 +1060,11 @@ enum {
2298         SB_FREEZE_TRANS = 2,
2299  };
2300  
2301 -#define vfs_check_frozen(sb, level) \
2302 -       wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
2303 +#define vfs_check_frozen(sb, level) do { \
2304 +       freezer_do_not_count(); \
2305 +       wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))); \
2306 +       freezer_count(); \
2307 +} while (0)
2308  
2309  #define get_fs_excl() atomic_inc(&current->fs_excl)
2310  #define put_fs_excl() atomic_dec(&current->fs_excl)
2311 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
2312 index 2df44e7..4d22ae4 100644
2313 --- a/include/linux/kernel.h
2314 +++ b/include/linux/kernel.h
2315 @@ -151,6 +151,8 @@ extern int vsprintf(char *buf, const char *, va_list)
2316         __attribute__ ((format (printf, 2, 0)));
2317  extern int snprintf(char * buf, size_t size, const char * fmt, ...)
2318         __attribute__ ((format (printf, 3, 4)));
2319 +extern int snprintf_used(char *buffer, int buffer_size,
2320 +               const char *fmt, ...);
2321  extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
2322         __attribute__ ((format (printf, 3, 0)));
2323  extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
2324 diff --git a/include/linux/netlink.h b/include/linux/netlink.h
2325 index fb0713b..2e14fde 100644
2326 --- a/include/linux/netlink.h
2327 +++ b/include/linux/netlink.h
2328 @@ -24,6 +24,8 @@
2329  /* leave room for NETLINK_DM (DM Events) */
2330  #define NETLINK_SCSITRANSPORT  18      /* SCSI Transports */
2331  #define NETLINK_ECRYPTFS       19
2332 +#define NETLINK_TOI_USERUI     20      /* TuxOnIce's userui */
2333 +#define NETLINK_TOI_USM                21      /* Userspace storage manager */
2334  
2335  #define MAX_LINKS 32           
2336  
2337 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
2338 index 1d7d4c5..bbbb987 100644
2339 --- a/include/linux/suspend.h
2340 +++ b/include/linux/suspend.h
2341 @@ -255,4 +255,69 @@ static inline void register_nosave_region_late(unsigned long b, unsigned long e)
2342  }
2343  #endif
2344  
2345 +enum {
2346 +       TOI_CAN_HIBERNATE,
2347 +       TOI_CAN_RESUME,
2348 +       TOI_RESUME_DEVICE_OK,
2349 +       TOI_NORESUME_SPECIFIED,
2350 +       TOI_SANITY_CHECK_PROMPT,
2351 +       TOI_CONTINUE_REQ,
2352 +       TOI_RESUMED_BEFORE,
2353 +       TOI_BOOT_TIME,
2354 +       TOI_NOW_RESUMING,
2355 +       TOI_IGNORE_LOGLEVEL,
2356 +       TOI_TRYING_TO_RESUME,
2357 +       TOI_LOADING_ALT_IMAGE,
2358 +       TOI_STOP_RESUME,
2359 +       TOI_IO_STOPPED,
2360 +       TOI_NOTIFIERS_PREPARE,
2361 +       TOI_CLUSTER_MODE,
2362 +};
2363 +
2364 +#ifdef CONFIG_TOI
2365 +
2366 +/* Used in init dir files */
2367 +extern unsigned long toi_state;
2368 +#define set_toi_state(bit) (set_bit(bit, &toi_state))
2369 +#define clear_toi_state(bit) (clear_bit(bit, &toi_state))
2370 +#define test_toi_state(bit) (test_bit(bit, &toi_state))
2371 +extern int toi_running;
2372 +
2373 +#else /* !CONFIG_TOI */
2374 +
2375 +#define toi_state              (0)
2376 +#define set_toi_state(bit) do { } while (0)
2377 +#define clear_toi_state(bit) do { } while (0)
2378 +#define test_toi_state(bit) (0)
2379 +#define toi_running (0)
2380 +#endif /* CONFIG_TOI */
2381 +
2382 +#ifdef CONFIG_HIBERNATION
2383 +#ifdef CONFIG_TOI
2384 +extern void toi_try_resume(void);
2385 +#else
2386 +#define toi_try_resume() do { } while (0)
2387 +#endif
2388 +
2389 +extern int resume_attempted;
2390 +extern int software_resume(void);
2391 +
2392 +static inline void check_resume_attempted(void)
2393 +{
2394 +       if (resume_attempted)
2395 +               return;
2396 +
2397 +       software_resume();
2398 +}
2399 +#else
2400 +#define check_resume_attempted() do { } while (0)
2401 +#define resume_attempted (0)
2402 +#endif
2403 +
2404 +#ifdef CONFIG_PRINTK_NOSAVE
2405 +#define POSS_NOSAVE __nosavedata
2406 +#else
2407 +#define POSS_NOSAVE
2408 +#endif
2409 +
2410  #endif /* _LINUX_SUSPEND_H */
2411 diff --git a/include/linux/swap.h b/include/linux/swap.h
2412 index 878459a..55315f9 100644
2413 --- a/include/linux/swap.h
2414 +++ b/include/linux/swap.h
2415 @@ -164,6 +164,7 @@ extern unsigned long totalram_pages;
2416  extern unsigned long totalreserve_pages;
2417  extern long nr_swap_pages;
2418  extern unsigned int nr_free_buffer_pages(void);
2419 +extern unsigned int nr_unallocated_buffer_pages(void);
2420  extern unsigned int nr_free_pagecache_pages(void);
2421  
2422  /* Definition of global_page_state not available yet */
2423 @@ -187,6 +188,8 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
2424                                                         gfp_t gfp_mask);
2425  extern int __isolate_lru_page(struct page *page, int mode);
2426  extern unsigned long shrink_all_memory(unsigned long nr_pages);
2427 +extern void shrink_one_zone(struct zone *zone, unsigned long desired_size,
2428 +               int ps_wanted);
2429  extern int vm_swappiness;
2430  extern int remove_mapping(struct address_space *mapping, struct page *page);
2431  extern long vm_total_pages;
2432 @@ -356,5 +359,10 @@ static inline swp_entry_t get_swap_page(void)
2433  #define disable_swap_token() do { } while(0)
2434  
2435  #endif /* CONFIG_SWAP */
2436 +
2437 +/* For TuxOnIce - unlink LRU pages while saving separately */
2438 +void unlink_lru_lists(void);
2439 +void relink_lru_lists(void);
2440 +
2441  #endif /* __KERNEL__*/
2442  #endif /* _LINUX_SWAP_H */
2443 diff --git a/init/do_mounts.c b/init/do_mounts.c
2444 index 3885e70..131427c 100644
2445 --- a/init/do_mounts.c
2446 +++ b/init/do_mounts.c
2447 @@ -373,6 +373,8 @@ void __init prepare_namespace(void)
2448         if (is_floppy && rd_doload && rd_load_disk(0))
2449                 ROOT_DEV = Root_RAM0;
2450  
2451 +       check_resume_attempted();
2452 +
2453         mount_root();
2454  out:
2455         sys_mount(".", "/", NULL, MS_MOVE, NULL);
2456 diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
2457 index 614241b..f3ea292 100644
2458 --- a/init/do_mounts_initrd.c
2459 +++ b/init/do_mounts_initrd.c
2460 @@ -6,6 +6,7 @@
2461  #include <linux/romfs_fs.h>
2462  #include <linux/initrd.h>
2463  #include <linux/sched.h>
2464 +#include <linux/suspend.h>
2465  #include <linux/freezer.h>
2466  
2467  #include "do_mounts.h"
2468 @@ -68,6 +69,11 @@ static void __init handle_initrd(void)
2469  
2470         current->flags &= ~PF_FREEZER_SKIP;
2471  
2472 +       if (!resume_attempted)
2473 +               printk(KERN_ERR "TuxOnIce: No attempt was made to resume from "
2474 +                               "any image that might exist.\n");
2475 +       clear_toi_state(TOI_BOOT_TIME);
2476 +
2477         /* move initrd to rootfs' /old */
2478         sys_fchdir(old_fd);
2479         sys_mount("/", ".", NULL, MS_MOVE, NULL);
2480 diff --git a/init/main.c b/init/main.c
2481 index 99ce949..f4f8330 100644
2482 --- a/init/main.c
2483 +++ b/init/main.c
2484 @@ -56,6 +56,7 @@
2485  #include <linux/pid_namespace.h>
2486  #include <linux/device.h>
2487  #include <linux/kthread.h>
2488 +#include <linux/dyn_pageflags.h>
2489  #include <linux/sched.h>
2490  #include <linux/signal.h>
2491  
2492 @@ -572,6 +573,7 @@ asmlinkage void __init start_kernel(void)
2493         softirq_init();
2494         timekeeping_init();
2495         time_init();
2496 +       dyn_pageflags_init();
2497         profile_init();
2498         if (!irqs_disabled())
2499                 printk("start_kernel(): bug: interrupts were enabled early\n");
2500 @@ -610,6 +612,7 @@ asmlinkage void __init start_kernel(void)
2501         enable_debug_pagealloc();
2502         cpu_hotplug_init();
2503         kmem_cache_init();
2504 +       dyn_pageflags_use_kzalloc();
2505         setup_per_cpu_pageset();
2506         numa_policy_init();
2507         if (late_time_init)
2508 diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
2509 index 6233f3b..8bd74dc 100644
2510 --- a/kernel/power/Kconfig
2511 +++ b/kernel/power/Kconfig
2512 @@ -44,6 +44,18 @@ config PM_VERBOSE
2513         ---help---
2514         This option enables verbose messages from the Power Management code.
2515  
2516 +config PRINTK_NOSAVE
2517 +       depends on PM && PM_DEBUG
2518 +       bool "Preserve printk data from boot kernel when resuming."
2519 +       default n
2520 +       ---help---
2521 +       This option gives printk data and the associated variables the
2522 +       attribute __nosave, which means that they will not be saved as
2523 +       part of the image. The net effect is that after resuming, your
2524 +       dmesg will show the messages from prior to the atomic restore,
2525 +       instead of the messages from the resumed kernel. This may be
2526 +       useful for debugging hibernation.
2527 +
2528  config CAN_PM_TRACE
2529         def_bool y
2530         depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
2531 @@ -178,6 +190,258 @@ config PM_STD_PARTITION
2532           suspended image to. It will simply pick the first available swap 
2533           device.
2534  
2535 +menuconfig TOI_CORE
2536 +       tristate "Enhanced Hibernation (TuxOnIce)"
2537 +       depends on HIBERNATION
2538 +       default y
2539 +       ---help---
2540 +         TuxOnIce is the 'new and improved' suspend support.
2541 +         
2542 +         See the TuxOnIce home page (tuxonice.net)
2543 +         for FAQs, HOWTOs and other documentation.
2544 +
2545 +       comment "Image Storage (you need at least one allocator)"
2546 +               depends on TOI_CORE
2547 +
2548 +       config TOI_FILE
2549 +               tristate "File Allocator"
2550 +               depends on TOI_CORE
2551 +               default y
2552 +               ---help---
2553 +                 This option enables support for storing an image in a
2554 +                 simple file. This should work, but we're still
2555 +                 testing it.
2556 +
2557 +       config TOI_SWAP
2558 +               tristate "Swap Allocator"
2559 +               depends on TOI_CORE && SWAP
2560 +               default y
2561 +               ---help---
2562 +                 This option enables support for storing an image in your
2563 +                 swap space.
2564 +
2565 +       comment "General Options"
2566 +               depends on TOI_CORE
2567 +
2568 +       config TOI_DEFAULT_PRE_HIBERNATE
2569 +               string "Default pre-hibernate command"
2570 +               depends on TOI_CORE
2571 +               ---help---
2572 +                 This entry allows you to specify a command to be run prior
2573 +                 to starting a hibernation cycle. If this command returns
2574 +                 a non-zero result code, hibernating will be aborted. If
2575 +                 you're starting hibernation via the hibernate script,
2576 +                 this value should probably be blank.
2577 +
2578 +       config TOI_DEFAULT_POST_HIBERNATE
2579 +               string "Default post-resume command"
2580 +               depends on TOI_CORE
2581 +               ---help---
2582 +                 This entry allows you to specify a command to be run after
2583 +                 completing a hibernation cycle. The return code of this
2584 +                 command is ignored. If you're starting hibernation via the
2585 +                 hibernate script, this value should probably be blank.
2586 +
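The contract above (the pre-hibernate command vetoes the cycle on a non-zero exit, the post-resume return code is ignored) maps onto the kernel's usermode helper. A hedged sketch, not the patch's actual implementation; run_cycle_command and the /bin/sh invocation are assumptions:

    /* Sketch: run a configured command and report its exit status. */
    #include <linux/kmod.h>

    static int run_cycle_command(char *cmd)
    {
            char *argv[] = { "/bin/sh", "-c", cmd, NULL };
            char *envp[] = { "HOME=/",
                             "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

            /* UMH_WAIT_PROC: block until the command exits. */
            return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
    }

A non-zero return from the pre-hibernate command would abort the cycle; for the post-resume command the value would simply be discarded.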
2587 +       config TOI_CRYPTO
2588 +               tristate "Compression support"
2589 +               depends on TOI_CORE && CRYPTO
2590 +               default y
2591 +               ---help---
2592 +                 This option adds support for using cryptoapi compression
2593 +                 algorithms. Compression is particularly useful as
2594 +                 the LZF support that comes with the TuxOnIce patch can double
2595 +                 your suspend and resume speed.
2596 +
2597 +                 You probably want this, so say Y here.
2598 +
2599 +       comment "No compression support available without Cryptoapi support."
2600 +               depends on TOI_CORE && !CRYPTO
2601 +
2602 +       config TOI_USERUI
2603 +               tristate "Userspace User Interface support"
2604 +               depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
2605 +               default y
2606 +               ---help---
2607 +                 This option enables support for a userspace-based user interface
2608 +                 to TuxOnIce, which allows you to have a nice display while suspending
2609 +                 and resuming, and also enables features such as pressing escape to
2610 +                 cancel a cycle or interactive debugging.
2611 +
2612 +       config TOI_USERUI_DEFAULT_PATH
2613 +               string "Default userui program location"
2614 +               default "/usr/local/sbin/tuxonice_fbsplash"
2615 +               depends on TOI_USERUI
2616 +               ---help---
2617 +                 This entry allows you to specify a default path to the userui binary.
2618 +
2619 +       config TOI_KEEP_IMAGE
2620 +               bool "Allow Keep Image Mode"
2621 +               depends on TOI_CORE
2622 +               ---help---
2623 +                 This option allows you to keep an image and reuse it. It is intended
2624 +                 __ONLY__ for use with systems where all filesystems are mounted read-
2625 +                 only (kiosks, for example). To use it, compile this option in and boot
2626 +                 normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
2627 +                 When you resume, the image will not be removed. You will be unable to turn
2628 +                 off swap partitions (assuming you are using the swap allocator), but future
2629 +                 suspends simply do a power-down. The image can be updated using the
2630 +                 kernel command line parameter suspend_act= to turn off the keep image
2631 +                 bit. Keep image mode is a little less user friendly on purpose - it
2632 +                 should not be used without thought!
2633 +
2634 +       config TOI_REPLACE_SWSUSP
2635 +               bool "Replace swsusp by default"
2636 +               default y
2637 +               depends on TOI_CORE
2638 +               ---help---
2639 +                 TuxOnIce can replace swsusp. This option makes that the default state,
2640 +                 requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
2641 +                 to use the vanilla kernel functionality. Note that your initrd/ramfs will
2642 +                 need to do this before trying to resume, too.
2643 +                 With overriding swsusp enabled, echoing 'disk' to /sys/power/state will
2644 +                 start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
2645 +                 the swap and file allocators are compiled in, the swap allocator will be
2646 +                 used by default.
2647 +
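The replace_swsusp toggle described above is a plain sysfs write, so an initrd can flip it before attempting resume. A small userspace sketch (write_sysfs is a hypothetical helper; the paths are the ones named in the help text):

    /* Userspace sketch: opt back into vanilla swsusp, then trigger it. */
    #include <stdio.h>

    static int write_sysfs(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    /* write_sysfs("/sys/power/tuxonice/replace_swsusp", "0");
     * write_sysfs("/sys/power/state", "disk");  -- vanilla cycle */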
2648 +       menuconfig TOI_CLUSTER
2649 +               tristate "Cluster support"
2650 +               default n
2651 +               depends on TOI_CORE && NET && BROKEN
2652 +               ---help---
2653 +                 Support for linking multiple machines in a cluster so that they suspend
2654 +                 and resume together.
2655 +
2656 +       config TOI_DEFAULT_CLUSTER_INTERFACE
2657 +               string "Default cluster interface"
2658 +               depends on TOI_CLUSTER
2659 +               ---help---
2660 +                 The default interface on which to communicate with other nodes in
2661 +                 the cluster.
2662 +
2663 +                 If no value is set here, cluster support will be disabled by default.
2664 +
2665 +       config TOI_DEFAULT_CLUSTER_KEY
2666 +               string "Default cluster key"
2667 +               default "Default"
2668 +               depends on TOI_CLUSTER
2669 +               ---help---
2670 +                 The default key used by this node. All nodes in the same cluster
2671 +                 have the same key. Multiple clusters may coexist on the same LAN
2672 +                 by using different values for this key.
2673 +
2674 +       config TOI_CLUSTER_IMAGE_TIMEOUT
2675 +               int "Timeout when checking for image"
2676 +               default 15
2677 +               depends on TOI_CLUSTER
2678 +               ---help---
2679 +                 Timeout (seconds) before continuing to boot when waiting to see
2680 +                 whether other nodes might have an image. Set to -1 to wait
2681 +                 indefinitely. If WAIT_UNTIL_NODES is non-zero, we might continue
2682 +                 booting sooner than this timeout.
2683 +
2684 +       config TOI_CLUSTER_WAIT_UNTIL_NODES
2685 +               int "Nodes without image before continuing"
2686 +               default 0
2687 +               depends on TOI_CLUSTER
2688 +               ---help---
2689 +                 When booting and no image is found, we wait to see if other nodes
2690 +                 have an image before continuing to boot. This value lets us
2691 +                 continue after seeing a certain number of nodes without an image,
2692 +                 instead of continuing to wait for the timeout. Set to 0 to only
2693 +                 use the timeout.
2694 +
2695 +       config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
2696 +               string "Default pre-hibernate script"
2697 +               depends on TOI_CLUSTER
2698 +               ---help---
2699 +                 The default script to be called when starting to hibernate.
2700 +
2701 +       config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
2702 +               string "Default post-hibernate script"
2703 +               depends on TOI_CLUSTER
2704 +               ---help---
2705 +                 The default script to be called after resuming from hibernation.
2706 +
2707 +       config TOI_CHECKSUM
2708 +               bool "Checksum pageset2"
2709 +               default y
2710 +               depends on TOI_CORE
2711 +               select CRYPTO
2712 +               select CRYPTO_ALGAPI
2713 +               select CRYPTO_MD4
2714 +               ---help---
2715 +                 Adds support for checksumming pageset2 pages, to ensure you really get an
2716 +                 atomic copy. Since some filesystems (XFS especially) change metadata even
2717 +                 when there's no other activity, we need this to check for pages that have
2718 +                 been changed while we were saving the page cache. If your debugging output
2719 +                 always says no pages were resaved, you may be able to safely disable this
2720 +                 option.
2721 +
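For a sense of what checksumming a pageset2 page involves, here is a rough sketch against the cryptoapi hash interface of this kernel generation (illustrative only: checksum_one_page and its buffer handling are assumptions, not the code in tuxonice_checksum.c):

    /* Sketch: MD4-digest one page so a later pass can detect pages
     * that changed while the page cache was being written out. */
    #include <linux/crypto.h>
    #include <linux/scatterlist.h>
    #include <asm/page.h>           /* PAGE_SIZE */

    static int checksum_one_page(struct crypto_hash *tfm, void *addr, u8 *out)
    {
            struct hash_desc desc = { .tfm = tfm, .flags = 0 };
            struct scatterlist sg;

            sg_init_one(&sg, addr, PAGE_SIZE);
            return crypto_hash_digest(&desc, &sg, PAGE_SIZE, out);
    }

    /* tfm from crypto_alloc_hash("md4", 0, CRYPTO_ALG_ASYNC); comparing
     * successive digests flags pages that need resaving. */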
2722 +       config TOI_DEFAULT_WAIT
2723 +               int "Default waiting time for emergency boot messages"
2724 +               default "25"
2725 +               range -1 32768
2726 +               depends on TOI_CORE
2727 +               help
2728 +                 TuxOnIce can display warnings very early in the process of resuming,
2729 +                 if (for example) it appears that you have booted a kernel that doesn't
2730 +                 match an image on disk. It can then give you the opportunity to either
2731 +                 continue booting that kernel, or reboot the machine. This option can be
2732 +                 used to control how long to wait in such circumstances. -1 means wait
2733 +                 forever. 0 means don't wait at all (do the default action, which will
2734 +                 generally be to continue booting and remove the image). Values of 1 or
2735 +                 more indicate a number of seconds (up to 255) to wait before doing the
2736 +                 default.
2737 +
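The wait semantics above (-1 forever, 0 act at once, otherwise a count of seconds) reduce to a loop of the following shape. A sketch under assumed names: emergency_wait and key_pressed() are hypothetical, not functions in this patch:

    /* Sketch: honour -1 (wait forever) and 0 (no wait) before falling
     * back to the default action. */
    #include <linux/delay.h>

    static int emergency_wait(int wait_secs)
    {
            while (wait_secs) {
                    if (key_pressed())      /* hypothetical input poll */
                            return 1;       /* user chose an action */
                    ssleep(1);
                    if (wait_secs > 0)
                            wait_secs--;
            }
            return 0;                       /* take the default action */
    }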
2738 +       config TOI_PAGEFLAGS_TEST
2739 +               tristate "Test pageflags"
2740 +               default n
2741 +               depends on TOI_CORE
2742 +               help
2743 +                 Test pageflags.
2744 +
2745 +config TOI_PAGEFLAGS_EXPORTS
2746 +       bool
2747 +       depends on TOI_PAGEFLAGS_TEST=m
2748 +       default y
2749 +
2750 +config TOI_USERUI_EXPORTS
2751 +       bool
2752 +       depends on TOI_USERUI=m
2753 +       default y
2754 +
2755 +config TOI_SWAP_EXPORTS
2756 +       bool
2757 +       depends on TOI_SWAP=m
2758 +       default y
2759 +
2760 +config TOI_FILE_EXPORTS
2761 +       bool
2762 +       depends on TOI_FILE=m
2763 +       default y
2764 +
2765 +config TOI_CRYPTO_EXPORTS
2766 +       bool
2767 +       depends on TOI_CRYPTO=m
2768 +       default y
2769 +
2770 +config TOI_CORE_EXPORTS
2771 +       bool
2772 +       depends on TOI_CORE=m
2773 +       default y
2774 +
2775 +config TOI_EXPORTS
2776 +       bool
2777 +       depends on TOI_SWAP_EXPORTS || TOI_FILE_EXPORTS || \
2778 +               TOI_CRYPTO_EXPORTS || TOI_CLUSTER=m || \
2779 +               TOI_USERUI_EXPORTS || TOI_PAGEFLAGS_EXPORTS
2780 +       default y
2781 +
2782 +config TOI
2783 +       bool
2784 +       depends on TOI_CORE!=n
2785 +       default y
2786 +
2787  config APM_EMULATION
2788         tristate "Advanced Power Management Emulation"
2789         depends on PM && SYS_SUPPORTS_APM_EMULATION
2790 diff --git a/kernel/power/Makefile b/kernel/power/Makefile
2791 index f7dfff2..8ea53fa 100644
2792 --- a/kernel/power/Makefile
2793 +++ b/kernel/power/Makefile
2794 @@ -5,6 +5,37 @@ endif
2795  
2796  obj-y                          := main.o
2797  obj-$(CONFIG_PM_LEGACY)                += pm.o
2798 +
2799 +tuxonice_core-objs := tuxonice_modules.o tuxonice_sysfs.o tuxonice_highlevel.o \
2800 +               tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
2801 +               tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
2802 +               tuxonice_power_off.o tuxonice_atomic_copy.o
2803 +
2804 +obj-$(CONFIG_TOI)              += tuxonice_builtin.o
2805 +
2806 +ifdef CONFIG_PM_DEBUG
2807 +tuxonice_core-objs             += tuxonice_alloc.o
2808 +endif
2809 +
2810 +ifdef CONFIG_TOI_CHECKSUM
2811 +tuxonice_core-objs             += tuxonice_checksum.o
2812 +endif
2813 +
2814 +ifdef CONFIG_NET
2815 +tuxonice_core-objs             += tuxonice_storage.o tuxonice_netlink.o
2816 +endif
2817 +
2818 +obj-$(CONFIG_TOI_CORE)         += tuxonice_core.o
2819 +obj-$(CONFIG_TOI_CRYPTO)       += tuxonice_compress.o
2820 +
2821 +obj-$(CONFIG_TOI_SWAP)         += tuxonice_block_io.o tuxonice_swap.o
2822 +obj-$(CONFIG_TOI_FILE)         += tuxonice_block_io.o tuxonice_file.o
2823 +obj-$(CONFIG_TOI_CLUSTER)      += tuxonice_cluster.o
2824 +
2825 +obj-$(CONFIG_TOI_USERUI)       += tuxonice_userui.o
2826 +
2827 +obj-$(CONFIG_TOI_PAGEFLAGS_TEST)       += toi_pageflags_test.o
2828 +
2829  obj-$(CONFIG_PM_SLEEP)         += process.o console.o
2830  obj-$(CONFIG_HIBERNATION)      += swsusp.o disk.o snapshot.o swap.o user.o
2831  
2832 diff --git a/kernel/power/disk.c b/kernel/power/disk.c
2833 index 14a656c..141606e 100644
2834 --- a/kernel/power/disk.c
2835 +++ b/kernel/power/disk.c
2836 @@ -24,9 +24,11 @@
2837  
2838  #include "power.h"
2839  
2840 +#include "tuxonice.h"
2841 +#include "tuxonice_builtin.h"
2842  
2843  static int noresume = 0;
2844 -static char resume_file[256] = CONFIG_PM_STD_PARTITION;
2845 +char resume_file[256] = CONFIG_PM_STD_PARTITION;
2846  dev_t swsusp_resume_device;
2847  sector_t swsusp_resume_block;
2848  
2849 @@ -104,7 +106,7 @@ static int hibernation_test(int level) { return 0; }
2850   *     hibernation
2851   */
2852  
2853 -static int platform_begin(int platform_mode)
2854 +int platform_begin(int platform_mode)
2855  {
2856         return (platform_mode && hibernation_ops) ?
2857                 hibernation_ops->begin() : 0;
2858 @@ -115,7 +117,7 @@ static int platform_begin(int platform_mode)
2859   *     working state
2860   */
2861  
2862 -static void platform_end(int platform_mode)
2863 +void platform_end(int platform_mode)
2864  {
2865         if (platform_mode && hibernation_ops)
2866                 hibernation_ops->end();
2867 @@ -126,7 +128,7 @@ static void platform_end(int platform_mode)
2868   *     platform driver if so configured and return an error code if it fails
2869   */
2870  
2871 -static int platform_pre_snapshot(int platform_mode)
2872 +int platform_pre_snapshot(int platform_mode)
2873  {
2874         return (platform_mode && hibernation_ops) ?
2875                 hibernation_ops->pre_snapshot() : 0;
2876 @@ -137,7 +139,7 @@ static int platform_pre_snapshot(int platform_mode)
2877   *     of operation using the platform driver (called with interrupts disabled)
2878   */
2879  
2880 -static void platform_leave(int platform_mode)
2881 +void platform_leave(int platform_mode)
2882  {
2883         if (platform_mode && hibernation_ops)
2884                 hibernation_ops->leave();
2885 @@ -148,7 +150,7 @@ static void platform_leave(int platform_mode)
2886   *     using the platform driver (must be called after platform_prepare())
2887   */
2888  
2889 -static void platform_finish(int platform_mode)
2890 +void platform_finish(int platform_mode)
2891  {
2892         if (platform_mode && hibernation_ops)
2893                 hibernation_ops->finish();
2894 @@ -160,7 +162,7 @@ static void platform_finish(int platform_mode)
2895   *     called, platform_restore_cleanup() must be called.
2896   */
2897  
2898 -static int platform_pre_restore(int platform_mode)
2899 +int platform_pre_restore(int platform_mode)
2900  {
2901         return (platform_mode && hibernation_ops) ?
2902                 hibernation_ops->pre_restore() : 0;
2903 @@ -173,7 +175,7 @@ static int platform_pre_restore(int platform_mode)
2904   *     regardless of the result of platform_pre_restore().
2905   */
2906  
2907 -static void platform_restore_cleanup(int platform_mode)
2908 +void platform_restore_cleanup(int platform_mode)
2909  {
2910         if (platform_mode && hibernation_ops)
2911                 hibernation_ops->restore_cleanup();
2912 @@ -477,6 +479,11 @@ int hibernate(void)
2913  {
2914         int error;
2915  
2916 +#ifdef CONFIG_TOI
2917 +       if (test_action_state(TOI_REPLACE_SWSUSP))
2918 +               return toi_try_hibernate(1);
2919 +#endif
2920 +
2921         mutex_lock(&pm_mutex);
2922         /* The snapshot device should not be opened while we're running */
2923         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
2924 @@ -549,10 +556,21 @@ int hibernate(void)
2925   *
2926   */
2927  
2928 -static int software_resume(void)
2929 +int software_resume(void)
2930  {
2931         int error;
2932         unsigned int flags;
2933 +       resume_attempted = 1;
2934 +
2935 +#ifdef CONFIG_TOI
2936 +       /*
2937 +        * We can't know (until an image header, if any, is loaded) whether
2938 +        * we did override swsusp. We therefore ensure that both are tried.
2939 +        */
2940 +       if (test_action_state(TOI_REPLACE_SWSUSP))
2941 +               printk(KERN_INFO "Replacing swsusp.\n");
2942 +       toi_try_resume();
2943 +#endif
2944  
2945         /*
2946          * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
2947 @@ -565,6 +583,7 @@ static int software_resume(void)
2948          * here to avoid lockdep complaining.
2949          */
2950         mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
2951 +
2952         if (!swsusp_resume_device) {
2953                 if (!strlen(resume_file)) {
2954                         mutex_unlock(&pm_mutex);
2955 @@ -636,9 +655,6 @@ static int software_resume(void)
2956         return error;
2957  }
2958  
2959 -late_initcall(software_resume);
2960 -
2961 -
2962  static const char * const hibernation_modes[] = {
2963         [HIBERNATION_PLATFORM]  = "platform",
2964         [HIBERNATION_SHUTDOWN]  = "shutdown",
2965 @@ -851,6 +867,7 @@ static int __init resume_offset_setup(char *str)
2966  static int __init noresume_setup(char *str)
2967  {
2968         noresume = 1;
2969 +       set_toi_state(TOI_NORESUME_SPECIFIED);
2970         return 1;
2971  }
2972  
2973 diff --git a/kernel/power/power.h b/kernel/power/power.h
2974 index 700f44e..fdc558a 100644
2975 --- a/kernel/power/power.h
2976 +++ b/kernel/power/power.h
2977 @@ -1,7 +1,16 @@
2978 +/*
2979 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
2980 + */
2981 +
2982 +#ifndef KERNEL_POWER_POWER_H
2983 +#define KERNEL_POWER_POWER_H
2984 +
2985  #include <linux/suspend.h>
2986  #include <linux/suspend_ioctls.h>
2987  #include <linux/utsname.h>
2988  #include <linux/freezer.h>
2989 +#include "tuxonice.h"
2990 +#include "tuxonice_builtin.h"
2991  
2992  struct swsusp_info {
2993         struct new_utsname      uts;
2994 @@ -21,18 +30,22 @@ struct swsusp_info {
2995  extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
2996  extern int arch_hibernation_header_restore(void *addr);
2997  
2998 -static inline int init_header_complete(struct swsusp_info *info)
2999 +static inline int init_swsusp_header_complete(struct swsusp_info *info)
3000  {
3001         return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
3002  }
3003  
3004 -static inline char *check_image_kernel(struct swsusp_info *info)
3005 +static inline char *check_swsusp_image_kernel(struct swsusp_info *info)
3006  {
3007         return arch_hibernation_header_restore(info) ?
3008                         "architecture specific data" : NULL;
3009  }
3010 +#else
3011 +extern char *check_swsusp_image_kernel(struct swsusp_info *info);
3012  #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
3013 +extern int init_swsusp_header(struct swsusp_info *info);
3014  
3015 +extern char resume_file[256];
3016  /*
3017   * Keep some memory free so that I/O operations can succeed without paging
3018   * [Might this be more than 4 MB?]
3019 @@ -65,6 +78,8 @@ static struct kobj_attribute _name##_attr = { \
3020         .store  = _name##_store,                \
3021  }
3022  
3023 +extern struct pbe *restore_pblist;
3024 +
3025  /* Preferred image size in bytes (default 500 MB) */
3026  extern unsigned long image_size;
3027  extern int in_suspend;
3028 @@ -225,3 +240,26 @@ static inline void suspend_thaw_processes(void)
3029  {
3030  }
3031  #endif
3032 +
3033 +extern struct page *saveable_page(unsigned long pfn);
3034 +#ifdef CONFIG_HIGHMEM
3035 +extern struct page *saveable_highmem_page(unsigned long pfn);
3036 +#else
3037 +static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
3038 +#endif
3039 +
3040 +#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
3041 +extern struct list_head nosave_regions;
3042 +
3043 +/**
3044 + *     This structure represents a range of page frames the contents of which
3045 + *     should not be saved during the suspend.
3046 + */
3047 +
3048 +struct nosave_region {
3049 +       struct list_head list;
3050 +       unsigned long start_pfn;
3051 +       unsigned long end_pfn;
3052 +};
3053 +
3054 +#endif
3055 diff --git a/kernel/power/process.c b/kernel/power/process.c
3056 index f1d0b34..b835412 100644
3057 --- a/kernel/power/process.c
3058 +++ b/kernel/power/process.c
3059 @@ -13,6 +13,10 @@
3060  #include <linux/module.h>
3061  #include <linux/syscalls.h>
3062  #include <linux/freezer.h>
3063 +#include <linux/buffer_head.h>
3064 +
3065 +int freezer_state;
3066 +EXPORT_SYMBOL(freezer_state);
3067  
3068  /* 
3069   * Timeout for stopping processes
3070 @@ -74,6 +78,7 @@ void refrigerator(void)
3071         pr_debug("%s left refrigerator\n", current->comm);
3072         __set_current_state(save);
3073  }
3074 +EXPORT_SYMBOL(refrigerator);
3075  
3076  static void fake_signal_wake_up(struct task_struct *p)
3077  {
3078 @@ -214,7 +219,8 @@ static int try_to_freeze_tasks(int freeze_user_space)
3079                 do_each_thread(g, p) {
3080                         task_lock(p);
3081                         if (freezing(p) && !freezer_should_skip(p))
3082 -                               printk(KERN_ERR " %s\n", p->comm);
3083 +                               printk(KERN_ERR " %s (%d) failed to freeze.\n",
3084 +                                               p->comm, p->pid);
3085                         cancel_freezing(p);
3086                         task_unlock(p);
3087                 } while_each_thread(g, p);
3088 @@ -234,17 +240,25 @@ int freeze_processes(void)
3089  {
3090         int error;
3091  
3092 -       printk("Freezing user space processes ... ");
3093 +       printk(KERN_INFO "Stopping fuse filesystems.\n");
3094 +       freeze_filesystems(FS_FREEZER_FUSE);
3095 +       freezer_state = FREEZER_FILESYSTEMS_FROZEN;
3096 +       printk(KERN_INFO "Freezing user space processes ... ");
3097         error = try_to_freeze_tasks(FREEZER_USER_SPACE);
3098         if (error)
3099                 goto Exit;
3100 -       printk("done.\n");
3101 +       printk(KERN_INFO "done.\n");
3102  
3103 -       printk("Freezing remaining freezable tasks ... ");
3104 +       sys_sync();
3105 +       printk(KERN_INFO "Stopping normal filesystems.\n");
3106 +       freeze_filesystems(FS_FREEZER_NORMAL);
3107 +       freezer_state = FREEZER_USERSPACE_FROZEN;
3108 +       printk(KERN_INFO "Freezing remaining freezable tasks ... ");
3109         error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
3110         if (error)
3111                 goto Exit;
3112         printk("done.");
3113 +       freezer_state = FREEZER_FULLY_ON;
3114   Exit:
3115         BUG_ON(in_atomic());
3116         printk("\n");
3117 @@ -270,11 +284,33 @@ static void thaw_tasks(int thaw_user_space)
3118  
3119  void thaw_processes(void)
3120  {
3121 -       printk("Restarting tasks ... ");
3122 -       thaw_tasks(FREEZER_KERNEL_THREADS);
3123 +       int old_state = freezer_state;
3124 +
3125 +       if (old_state == FREEZER_OFF)
3126 +               return;
3127 +
3128 +       /*
3129 +        * Change state beforehand because thawed tasks might submit I/O
3130 +        * immediately.
3131 +        */
3132 +       freezer_state = FREEZER_OFF;
3133 +
3134 +       printk(KERN_INFO "Restarting all filesystems ...\n");
3135 +       thaw_filesystems(FS_FREEZER_ALL);
3136 +
3137 +       printk(KERN_INFO "Restarting tasks ... ");
3138 +
3139 +       if (old_state == FREEZER_FULLY_ON)
3140 +               thaw_tasks(FREEZER_KERNEL_THREADS);
3141         thaw_tasks(FREEZER_USER_SPACE);
3142         schedule();
3143         printk("done.\n");
3144  }
3145  
3146 -EXPORT_SYMBOL(refrigerator);
3147 +void thaw_kernel_threads(void)
3148 +{
3149 +       freezer_state = FREEZER_USERSPACE_FROZEN;
3150 +       printk(KERN_INFO "Restarting normal filesystems.\n");
3151 +       thaw_filesystems(FS_FREEZER_NORMAL);
3152 +       thaw_tasks(FREEZER_KERNEL_THREADS);
3153 +}
3154 diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
3155 index 5f91a07..816b60c 100644
3156 --- a/kernel/power/snapshot.c
3157 +++ b/kernel/power/snapshot.c
3158 @@ -33,6 +33,7 @@
3159  #include <asm/io.h>
3160  
3161  #include "power.h"
3162 +#include "tuxonice_builtin.h"
3163  
3164  static int swsusp_page_is_free(struct page *);
3165  static void swsusp_set_page_forbidden(struct page *);
3166 @@ -44,6 +45,13 @@ static void swsusp_unset_page_forbidden(struct page *);
3167   * directly to their "original" page frames.
3168   */
3169  struct pbe *restore_pblist;
3170 +int resume_attempted;
3171 +EXPORT_SYMBOL_GPL(resume_attempted);
3172 +
3173 +#ifdef CONFIG_TOI
3174 +#include "tuxonice_pagedir.h"
3175 +int toi_post_context_save(void);
3176 +#endif
3177  
3178  /* Pointer to an auxiliary buffer (1 page) */
3179  static void *buffer;
3180 @@ -86,6 +94,11 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
3181  
3182  unsigned long get_safe_page(gfp_t gfp_mask)
3183  {
3184 +#ifdef CONFIG_TOI
3185 +       if (toi_running)
3186 +               return toi_get_nonconflicting_page();
3187 +#endif
3188 +
3189         return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
3190  }
3191  
3192 @@ -607,18 +620,8 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
3193         return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
3194  }
3195  
3196 -/**
3197 - *     This structure represents a range of page frames the contents of which
3198 - *     should not be saved during the suspend.
3199 - */
3200 -
3201 -struct nosave_region {
3202 -       struct list_head list;
3203 -       unsigned long start_pfn;
3204 -       unsigned long end_pfn;
3205 -};
3206 -
3207 -static LIST_HEAD(nosave_regions);
3208 +LIST_HEAD(nosave_regions);
3209 +EXPORT_SYMBOL_GPL(nosave_regions);
3210  
3211  /**
3212   *     register_nosave_region - register a range of page frames the contents
3213 @@ -855,7 +858,7 @@ static unsigned int count_free_highmem_pages(void)
3214   *     and it isn't a part of a free chunk of pages.
3215   */
3216  
3217 -static struct page *saveable_highmem_page(unsigned long pfn)
3218 +struct page *saveable_highmem_page(unsigned long pfn)
3219  {
3220         struct page *page;
3221  
3222 @@ -897,8 +900,6 @@ unsigned int count_highmem_pages(void)
3223         }
3224         return n;
3225  }
3226 -#else
3227 -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
3228  #endif /* CONFIG_HIGHMEM */
3229  
3230  /**
3231 @@ -910,7 +911,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
3232   *     a free chunk of pages.
3233   */
3234  
3235 -static struct page *saveable_page(unsigned long pfn)
3236 +struct page *saveable_page(unsigned long pfn)
3237  {
3238         struct page *page;
3239  
3240 @@ -1244,6 +1245,11 @@ asmlinkage int swsusp_save(void)
3241  {
3242         unsigned int nr_pages, nr_highmem;
3243  
3244 +#ifdef CONFIG_TOI
3245 +       if (toi_running)
3246 +               return toi_post_context_save();
3247 +#endif
3248 +
3249         printk(KERN_INFO "PM: Creating hibernation image: \n");
3250  
3251         drain_local_pages(NULL);
3252 @@ -1284,14 +1290,14 @@ asmlinkage int swsusp_save(void)
3253  }
3254  
3255  #ifndef CONFIG_ARCH_HIBERNATION_HEADER
3256 -static int init_header_complete(struct swsusp_info *info)
3257 +int init_swsusp_header_complete(struct swsusp_info *info)
3258  {
3259         memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
3260         info->version_code = LINUX_VERSION_CODE;
3261         return 0;
3262  }
3263  
3264 -static char *check_image_kernel(struct swsusp_info *info)
3265 +char *check_swsusp_image_kernel(struct swsusp_info *info)
3266  {
3267         if (info->version_code != LINUX_VERSION_CODE)
3268                 return "kernel version";
3269 @@ -1305,6 +1311,7 @@ static char *check_image_kernel(struct swsusp_info *info)
3270                 return "machine";
3271         return NULL;
3272  }
3273 +EXPORT_SYMBOL_GPL(check_swsusp_image_kernel);
3274  #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
3275  
3276  unsigned long snapshot_get_image_size(void)
3277 @@ -1312,7 +1319,7 @@ unsigned long snapshot_get_image_size(void)
3278         return nr_copy_pages + nr_meta_pages + 1;
3279  }
3280  
3281 -static int init_header(struct swsusp_info *info)
3282 +int init_swsusp_header(struct swsusp_info *info)
3283  {
3284         memset(info, 0, sizeof(struct swsusp_info));
3285         info->num_physpages = num_physpages;
3286 @@ -1320,7 +1327,7 @@ static int init_header(struct swsusp_info *info)
3287         info->pages = snapshot_get_image_size();
3288         info->size = info->pages;
3289         info->size <<= PAGE_SHIFT;
3290 -       return init_header_complete(info);
3291 +       return init_swsusp_header_complete(info);
3292  }
3293  
3294  /**
3295 @@ -1376,7 +1383,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
3296         if (!handle->offset) {
3297                 int error;
3298  
3299 -               error = init_header((struct swsusp_info *)buffer);
3300 +               error = init_swsusp_header((struct swsusp_info *)buffer);
3301                 if (error)
3302                         return error;
3303                 handle->buffer = buffer;
3304 @@ -1473,7 +1480,7 @@ static int check_header(struct swsusp_info *info)
3305  {
3306         char *reason;
3307  
3308 -       reason = check_image_kernel(info);
3309 +       reason = check_swsusp_image_kernel(info);
3310         if (!reason && info->num_physpages != num_physpages)
3311                 reason = "memory size";
3312         if (reason) {
3313 diff --git a/kernel/power/toi_pageflags_test.c b/kernel/power/toi_pageflags_test.c
3314 new file mode 100644
3315 index 0000000..381f05b
3316 --- /dev/null
3317 +++ b/kernel/power/toi_pageflags_test.c
3318 @@ -0,0 +1,80 @@
3319 +/*
3320 + * TuxOnIce pageflags tester.
3321 + */
3322 +
2323 +#include <linux/module.h>
2324 +#include <linux/bootmem.h>
2325 +#include <linux/sched.h>
2326 +#include <linux/dyn_pageflags.h>
3327 +
3328 +DECLARE_DYN_PAGEFLAGS(test_map);
3329 +
3330 +static char *bits_on(void)
3331 +{
3332 +       char *page = (char *) get_zeroed_page(GFP_KERNEL);
3333 +       unsigned long index = get_next_bit_on(&test_map, max_pfn + 1);
3334 +       int pos = 0;
3335 +
3336 +       while (index <= max_pfn) {
3337 +               pos += snprintf_used(page + pos, PAGE_SIZE - pos - 1, "%d ",
3338 +                               index);
3339 +               index = get_next_bit_on(&test_map, index);
3340 +       }
3341 +
3342 +       return page;
3343 +}
3344 +
3345 +static __init int do_check(void)
3346 +{
3347 +       unsigned long index;
3348 +       int step = 1, steps = 100;
3349 +
3350 +       allocate_dyn_pageflags(&test_map, 0);
3351 +
3352 +       for (index = 1; index < max_pfn; index++) {
3353 +               char *result;
3354 +               char compare[100];
3355 +
3356 +               if (index > (max_pfn / steps * step)) {
3357 +                       printk(KERN_INFO "%d/%d\r", step, steps);
3358 +                       step++;
3359 +               }
3360 +
3361 +
3362 +               if (!pfn_valid(index))
3363 +                       continue;
3364 +
3365 +               clear_dyn_pageflags(&test_map);
3366 +               set_dynpageflag(&test_map, pfn_to_page(0));
3367 +               set_dynpageflag(&test_map, pfn_to_page(index));
3368 +
3369 +               sprintf(compare, "0 %lu ", index);
3370 +
3371 +               result = bits_on();
3372 +
3373 +               if (strcmp(result, compare)) {
3374 +                       printk(KERN_INFO "Expected \"%s\", got \"%s\"\n",
3375 +                                       result, compare);
3376 +               }
3377 +
3378 +               free_page((unsigned long) result);
3379 +               schedule();
3380 +       }
3381 +
3382 +       free_dyn_pageflags(&test_map);
3383 +       return 0;
3384 +}
3385 +
3386 +#ifdef MODULE
3387 +static __exit void check_unload(void)
3388 +{
3389 +}
3390 +
3391 +module_init(do_check);
3392 +module_exit(check_unload);
3393 +MODULE_AUTHOR("Nigel Cunningham");
3394 +MODULE_DESCRIPTION("Pageflags testing");
3395 +MODULE_LICENSE("GPL");
3396 +#else
3397 +late_initcall(do_check);
3398 +#endif
3399 diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
3400 new file mode 100644
3401 index 0000000..6943b24
3402 --- /dev/null
3403 +++ b/kernel/power/tuxonice.h
3404 @@ -0,0 +1,210 @@
3405 +/*
3406 + * kernel/power/tuxonice.h
3407 + *
3408 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
3409 + *
3410 + * This file is released under the GPLv2.
3411 + *
3412 + * It contains declarations used throughout swsusp.
3413 + *
3414 + */
3415 +
3416 +#ifndef KERNEL_POWER_TOI_H
3417 +#define KERNEL_POWER_TOI_H
3418 +
3419 +#include <linux/delay.h>
3420 +#include <linux/bootmem.h>
3421 +#include <linux/suspend.h>
3422 +#include <linux/dyn_pageflags.h>
3423 +#include <linux/fs.h>
3424 +#include <linux/kmod.h>
3425 +#include <asm/setup.h>
3426 +#include "tuxonice_pageflags.h"
3427 +
3428 +#define TOI_CORE_VERSION "3.0-rc7"
3429 +
3430 +#define MY_BOOT_KERNEL_DATA_VERSION 1
3431 +
3432 +struct toi_boot_kernel_data {
3433 +       int version;
3434 +       int size;
3435 +       unsigned long toi_action;
3436 +       unsigned long toi_debug_state;
3437 +       int toi_default_console_level;
3438 +       int toi_io_time[2][2];
3439 +       char toi_nosave_commandline[COMMAND_LINE_SIZE];
3440 +};
3441 +
3442 +extern struct toi_boot_kernel_data toi_bkd;
3443 +
2444 +/* Location of boot kernel data struct in kernel being resumed */
3445 +extern unsigned long boot_kernel_data_buffer;
3446 +
3447 +/*              == Action states ==            */
3448 +
3449 +enum {
3450 +       TOI_REBOOT,
3451 +       TOI_PAUSE,
3452 +       TOI_LOGALL,
3453 +       TOI_CAN_CANCEL,
3454 +       TOI_KEEP_IMAGE,
3455 +       TOI_FREEZER_TEST,
3456 +       TOI_SINGLESTEP,
3457 +       TOI_PAUSE_NEAR_PAGESET_END,
3458 +       TOI_TEST_FILTER_SPEED,
3459 +       TOI_TEST_BIO,
3460 +       TOI_NO_PAGESET2,
3461 +       TOI_PM_PREPARE_CONSOLE,
3462 +       TOI_IGNORE_ROOTFS,
3463 +       TOI_REPLACE_SWSUSP,
3464 +       TOI_PAGESET2_FULL,
3465 +       TOI_ABORT_ON_RESAVE_NEEDED,
3466 +       TOI_NO_MULTITHREADED_IO,
3467 +       TOI_NO_DIRECT_LOAD,
3468 +       TOI_LATE_CPU_HOTPLUG,
3469 +       TOI_GET_MAX_MEM_ALLOCD,
3470 +       TOI_NO_FLUSHER_THREAD,
3471 +};
3472 +
3473 +#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
3474 +#define test_action_state(bit) (test_bit(bit, &toi_bkd.toi_action))
3475 +
3476 +/*              == Result states ==            */
3477 +
3478 +enum {
3479 +       TOI_ABORTED,
3480 +       TOI_ABORT_REQUESTED,
3481 +       TOI_NOSTORAGE_AVAILABLE,
3482 +       TOI_INSUFFICIENT_STORAGE,
3483 +       TOI_FREEZING_FAILED,
3484 +       TOI_KEPT_IMAGE,
3485 +       TOI_WOULD_EAT_MEMORY,
3486 +       TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
3487 +       TOI_PM_SEM,
3488 +       TOI_DEVICE_REFUSED,
3489 +       TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
3490 +       TOI_UNABLE_TO_PREPARE_IMAGE,
3491 +       TOI_FAILED_MODULE_INIT,
3492 +       TOI_FAILED_MODULE_CLEANUP,
3493 +       TOI_FAILED_IO,
3494 +       TOI_OUT_OF_MEMORY,
3495 +       TOI_IMAGE_ERROR,
3496 +       TOI_PLATFORM_PREP_FAILED,
3497 +       TOI_CPU_HOTPLUG_FAILED,
3498 +       TOI_ARCH_PREPARE_FAILED,
3499 +       TOI_RESAVE_NEEDED,
3500 +       TOI_CANT_SUSPEND,
3501 +       TOI_NOTIFIERS_PREPARE_FAILED,
3502 +       TOI_PRE_SNAPSHOT_FAILED,
3503 +       TOI_PRE_RESTORE_FAILED,
3504 +};
3505 +
3506 +extern unsigned long toi_result;
3507 +
3508 +#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
3509 +#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
3510 +                               test_and_set_bit(bit, &toi_result))
3511 +#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
3512 +#define test_result_state(bit) (test_bit(bit, &toi_result))
3513 +
3514 +/*      == Debug sections and levels ==        */
3515 +
3516 +/* debugging levels. */
3517 +enum {
3518 +       TOI_STATUS = 0,
3519 +       TOI_ERROR = 2,
3520 +       TOI_LOW,
3521 +       TOI_MEDIUM,
3522 +       TOI_HIGH,
3523 +       TOI_VERBOSE,
3524 +};
3525 +
3526 +enum {
3527 +       TOI_ANY_SECTION,
3528 +       TOI_EAT_MEMORY,
3529 +       TOI_IO,
3530 +       TOI_HEADER,
3531 +       TOI_WRITER,
3532 +       TOI_MEMORY,
3533 +};
3534 +
3535 +#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
3536 +#define clear_debug_state(bit) \
3537 +       (test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
3538 +#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
3539 +
3540 +/*             == Steps in hibernating ==      */
3541 +
3542 +enum {
3543 +       STEP_HIBERNATE_PREPARE_IMAGE,
3544 +       STEP_HIBERNATE_SAVE_IMAGE,
3545 +       STEP_HIBERNATE_POWERDOWN,
3546 +       STEP_RESUME_CAN_RESUME,
3547 +       STEP_RESUME_LOAD_PS1,
3548 +       STEP_RESUME_DO_RESTORE,
3549 +       STEP_RESUME_READ_PS2,
3550 +       STEP_RESUME_GO,
3551 +       STEP_RESUME_ALT_IMAGE,
3552 +       STEP_CLEANUP,
3553 +       STEP_QUIET_CLEANUP
3554 +};
3555 +
3556 +/*             == TuxOnIce states ==
3557 +       (see also include/linux/suspend.h)      */
3558 +
3559 +#define get_toi_state()  (toi_state)
3560 +#define restore_toi_state(saved_state) \
3561 +       do { toi_state = saved_state; } while (0)
3562 +
3563 +/*             == Module support ==            */
3564 +
3565 +struct toi_core_fns {
3566 +       int (*post_context_save)(void);
3567 +       unsigned long (*get_nonconflicting_page)(void);
3568 +       int (*try_hibernate)(int have_pmsem);
3569 +       void (*try_resume)(void);
3570 +};
3571 +
3572 +extern struct toi_core_fns *toi_core_fns;
3573 +
3574 +/*             == All else ==                  */
3575 +#define KB(x) ((x) << (PAGE_SHIFT - 10))
3576 +#define MB(x) ((x) >> (20 - PAGE_SHIFT))
3577 +
3578 +extern int toi_start_anything(int toi_or_resume);
3579 +extern void toi_finish_anything(int toi_or_resume);
3580 +
3581 +extern int save_image_part1(void);
3582 +extern int toi_atomic_restore(void);
3583 +
3584 +extern int _toi_try_hibernate(int have_pmsem);
3585 +extern void __toi_try_resume(void);
3586 +
3587 +extern int __toi_post_context_save(void);
3588 +
3589 +extern unsigned int nr_hibernates;
3590 +extern char alt_resume_param[256];
3591 +
3592 +extern void copyback_post(void);
3593 +extern int toi_hibernate(void);
3594 +extern long extra_pd1_pages_used;
3595 +
3596 +#define SECTOR_SIZE 512
3597 +
3598 +extern void toi_early_boot_message(int can_erase_image, int default_answer,
3599 +       char *warning_reason, ...);
3600 +
3601 +static inline int load_direct(struct page *page)
3602 +{
3603 +       return test_action_state(TOI_NO_DIRECT_LOAD) ? 0 :
3604 +               PagePageset1Copy(page);
3605 +}
3606 +
3607 +extern int pre_resume_freeze(void);
3608 +extern int do_check_can_resume(void);
3609 +extern int do_toi_step(int step);
3610 +extern int toi_launch_userspace_program(char *command, int channel_no,
3611 +               enum umh_wait wait);
3612 +
2613 +extern char *tuxonice_signature;
3614 +#endif
3615 diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
3616 new file mode 100644
3617 index 0000000..101f65b
3618 --- /dev/null
3619 +++ b/kernel/power/tuxonice_alloc.c
3620 @@ -0,0 +1,293 @@
3621 +/*
3622 + * kernel/power/tuxonice_alloc.c
3623 + *
3624 + * Copyright (C) 2007 Nigel Cunningham (nigel at tuxonice net)
3625 + *
3626 + * This file is released under the GPLv2.
3627 + *
3628 + */
3629 +
3630 +#ifdef CONFIG_PM_DEBUG
3631 +#include <linux/slab.h>
3632 +#include <linux/module.h>
3633 +#include "tuxonice_modules.h"
3634 +#include "tuxonice_sysfs.h"
3635 +
3636 +#define TOI_ALLOC_PATHS 39
3637 +
3638 +DEFINE_MUTEX(toi_alloc_mutex);
3639 +
3640 +static struct toi_module_ops toi_alloc_ops;
3641 +
3642 +static int toi_fail_num;
3643 +static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
3644 +               toi_free_count[TOI_ALLOC_PATHS],
3645 +               toi_test_count[TOI_ALLOC_PATHS],
3646 +               toi_fail_count[TOI_ALLOC_PATHS];
3647 +int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
3648 +int cur_allocd, max_allocd;
3649 +
3650 +static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
3651 +       "", /* 0 */
3652 +       "get_io_info_struct",
3653 +       "extent",
3654 +       "extent (loading chain)",
3655 +       "userui channel",
3656 +       "userui arg", /* 5 */
3657 +       "attention list metadata",
3658 +       "extra pagedir memory metadata",
3659 +       "bdev metadata",
3660 +       "extra pagedir memory",
3661 +       "header_locations_read", /* 10 */
3662 +       "bio queue",
3663 +       "prepare_readahead",
3664 +       "i/o buffer",
3665 +       "writer buffer in bio_init",
3666 +       "checksum buffer", /* 15 */
3667 +       "compression buffer",
3668 +       "filewriter signature op",
3669 +       "set resume param alloc1",
3670 +       "set resume param alloc2",
3671 +       "debugging info buffer", /* 20 */
3672 +       "check can resume buffer",
3673 +       "write module config buffer",
3674 +       "read module config buffer",
3675 +       "write image header buffer",
3676 +       "read pageset1 buffer", /* 25 */
3677 +       "get_have_image_data buffer",
3678 +       "checksum page",
3679 +       "worker rw loop",
3680 +       "get nonconflicting page",
3681 +       "ps1 load addresses", /* 30 */
3682 +       "remove swap image",
3683 +       "swap image exists",
3684 +       "swap parse sig location",
3685 +       "sysfs kobj",
3686 +       "swap mark resume attempted buffer", /* 35 */
3687 +       "cluster member",
3688 +       "boot kernel data buffer",
3689 +       "setting swap signature"
3690 +};
3691 +
3692 +#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
3693 +       do { \
3694 +               BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
3695 +               \
3696 +               if (FAIL_NUM == toi_fail_num) { \
3697 +                       atomic_inc(&toi_test_count[FAIL_NUM]); \
3698 +                       toi_fail_num = 0; \
3699 +                       return FAIL_VAL; \
3700 +               } \
3701 +       } while (0)
3702 +
3703 +static void alloc_update_stats(int fail_num, void *result)
3704 +{
3705 +       if (!result) {
3706 +               atomic_inc(&toi_fail_count[fail_num]);
3707 +               return;
3708 +       }
3709 +
3710 +       atomic_inc(&toi_alloc_count[fail_num]);
3711 +       if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
3712 +               mutex_lock(&toi_alloc_mutex);
3713 +               toi_cur_allocd[fail_num]++;
3714 +               cur_allocd++;
3715 +               if (unlikely(cur_allocd > max_allocd)) {
3716 +                       int i;
3717 +
3718 +                       for (i = 0; i < TOI_ALLOC_PATHS; i++)
3719 +                               toi_max_allocd[i] = toi_cur_allocd[i];
3720 +                       max_allocd = cur_allocd;
3721 +               }
3722 +               mutex_unlock(&toi_alloc_mutex);
3723 +       }
3724 +}
3725 +
3726 +static void free_update_stats(int fail_num)
3727 +{
3728 +       BUG_ON(fail_num >= TOI_ALLOC_PATHS);
3729 +       atomic_inc(&toi_free_count[fail_num]);
3730 +       if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
3731 +               mutex_lock(&toi_alloc_mutex);
3732 +               cur_allocd--;
3733 +               toi_cur_allocd[fail_num]--;
3734 +               mutex_unlock(&toi_alloc_mutex);
3735 +       }
3736 +}
3737 +
3738 +void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
3739 +{
3740 +       void *result;
3741 +
3742 +       if (toi_alloc_ops.enabled)
3743 +               MIGHT_FAIL(fail_num, NULL);
3744 +       result = kzalloc(size, flags);
3745 +       if (toi_alloc_ops.enabled)
3746 +               alloc_update_stats(fail_num, result);
3747 +       return result;
3748 +}
3749 +
3750 +unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
3751 +               unsigned int order)
3752 +{
3753 +       unsigned long result;
3754 +
3755 +       if (toi_alloc_ops.enabled)
3756 +               MIGHT_FAIL(fail_num, 0);
3757 +       result = __get_free_pages(mask, order);
3758 +       if (toi_alloc_ops.enabled)
3759 +               alloc_update_stats(fail_num, (void *) result);
3760 +       return result;
3761 +}
3762 +
3763 +struct page *toi_alloc_page(int fail_num, gfp_t mask)
3764 +{
3765 +       struct page *result;
3766 +
3767 +       if (toi_alloc_ops.enabled)
3768 +               MIGHT_FAIL(fail_num, 0);
3769 +       result = alloc_page(mask);
3770 +       if (toi_alloc_ops.enabled)
3771 +               alloc_update_stats(fail_num, (void *) result);
3772 +       return result;
3773 +}
3774 +
3775 +unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
3776 +{
3777 +       unsigned long result;
3778 +
3779 +       if (toi_alloc_ops.enabled)
3780 +               MIGHT_FAIL(fail_num, 0);
3781 +       result = get_zeroed_page(mask);
3782 +       if (toi_alloc_ops.enabled)
3783 +               alloc_update_stats(fail_num, (void *) result);
3784 +       return result;
3785 +}
3786 +
3787 +void toi_kfree(int fail_num, const void *arg)
3788 +{
3789 +       if (arg && toi_alloc_ops.enabled)
3790 +               free_update_stats(fail_num);
3791 +
3792 +       kfree(arg);
3793 +}
3794 +
3795 +void toi_free_page(int fail_num, unsigned long virt)
3796 +{
3797 +       if (virt && toi_alloc_ops.enabled)
3798 +               free_update_stats(fail_num);
3799 +
3800 +       free_page(virt);
3801 +}
3802 +
3803 +void toi__free_page(int fail_num, struct page *page)
3804 +{
3805 +       if (page && toi_alloc_ops.enabled)
3806 +               free_update_stats(fail_num);
3807 +
3808 +       __free_page(page);
3809 +}
3810 +
3811 +void toi_free_pages(int fail_num, struct page *page, int order)
3812 +{
3813 +       if (page && toi_alloc_ops.enabled)
3814 +               free_update_stats(fail_num);
3815 +
3816 +       __free_pages(page, order);
3817 +}
3818 +
3819 +void toi_alloc_print_debug_stats(void)
3820 +{
3821 +       int i, header_done = 0;
3822 +
3823 +       if (!toi_alloc_ops.enabled)
3824 +               return;
3825 +
3826 +       for (i = 0; i < TOI_ALLOC_PATHS; i++)
3827 +               if (atomic_read(&toi_alloc_count[i]) !=
3828 +                   atomic_read(&toi_free_count[i])) {
3829 +                       if (!header_done) {
3830 +                               printk(KERN_INFO "Idx  Allocs   Frees   Tests "
3831 +                                       "  Fails Max     Description\n");
3832 +                               header_done = 1;
3833 +                       }
3834 +
3835 +                       printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
3836 +                               atomic_read(&toi_alloc_count[i]),
3837 +                               atomic_read(&toi_free_count[i]),
3838 +                               atomic_read(&toi_test_count[i]),
3839 +                               atomic_read(&toi_fail_count[i]),
3840 +                               toi_max_allocd[i],
3841 +                               toi_alloc_desc[i]);
3842 +               }
3843 +}
3844 +EXPORT_SYMBOL_GPL(toi_alloc_print_debug_stats);
3845 +
3846 +static int toi_alloc_initialise(int starting_cycle)
3847 +{
3848 +       int i;
3849 +
3850 +       if (starting_cycle && toi_alloc_ops.enabled) {
3851 +               for (i = 0; i < TOI_ALLOC_PATHS; i++) {
3852 +                       atomic_set(&toi_alloc_count[i], 0);
3853 +                       atomic_set(&toi_free_count[i], 0);
3854 +                       atomic_set(&toi_test_count[i], 0);
3855 +                       atomic_set(&toi_fail_count[i], 0);
3856 +                       toi_cur_allocd[i] = 0;
3857 +                       toi_max_allocd[i] = 0;
3858 +               };
3859 +               max_allocd = 0;
3860 +               cur_allocd = 0;
3861 +       }
3862 +
3863 +       return 0;
3864 +}
3865 +
3866 +static struct toi_sysfs_data sysfs_params[] = {
3867 +       { TOI_ATTR("failure_test", SYSFS_RW),
3868 +         SYSFS_INT(&toi_fail_num, 0, 99, 0)
3869 +       },
3870 +
3871 +       { TOI_ATTR("find_max_mem_allocated", SYSFS_RW),
3872 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_GET_MAX_MEM_ALLOCD, 0)
3873 +       },
3874 +
3875 +       { TOI_ATTR("enabled", SYSFS_RW),
3876 +         SYSFS_INT(&toi_alloc_ops.enabled, 0, 1, 0)
3877 +       }
3878 +};
3879 +
3880 +static struct toi_module_ops toi_alloc_ops = {
3881 +       .type                                   = MISC_HIDDEN_MODULE,
3882 +       .name                                   = "allocation debugging",
3883 +       .directory                              = "alloc",
3884 +       .module                                 = THIS_MODULE,
3885 +       .early                                  = 1,
3886 +       .initialise                             = toi_alloc_initialise,
3887 +
3888 +       .sysfs_data             = sysfs_params,
3889 +       .num_sysfs_entries      = sizeof(sysfs_params) /
3890 +               sizeof(struct toi_sysfs_data),
3891 +};
3892 +
3893 +int toi_alloc_init(void)
3894 +{
3895 +       int result = toi_register_module(&toi_alloc_ops);
3896 +       toi_alloc_ops.enabled = 0;
3897 +       return result;
3898 +}
3899 +
3900 +void toi_alloc_exit(void)
3901 +{
3902 +       toi_unregister_module(&toi_alloc_ops);
3903 +}
3904 +#ifdef CONFIG_TOI_EXPORTS
3905 +EXPORT_SYMBOL_GPL(toi_kzalloc);
3906 +EXPORT_SYMBOL_GPL(toi_get_free_pages);
3907 +EXPORT_SYMBOL_GPL(toi_get_zeroed_page);
3908 +EXPORT_SYMBOL_GPL(toi_kfree);
3909 +EXPORT_SYMBOL_GPL(toi_free_page);
3910 +EXPORT_SYMBOL_GPL(toi__free_page);
3911 +EXPORT_SYMBOL_GPL(toi_alloc_page);
3912 +#endif
3913 +#endif
3914 diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
3915 new file mode 100644
3916 index 0000000..146c2bd
3917 --- /dev/null
3918 +++ b/kernel/power/tuxonice_alloc.h
3919 @@ -0,0 +1,51 @@
3920 +/*
3921 + * kernel/power/tuxonice_alloc.h
3922 + *
3923 + * Copyright (C) 2007 Nigel Cunningham (nigel at tuxonice net)
3924 + *
3925 + * This file is released under the GPLv2.
3926 + *
3927 + */
3928 +
3929 +#define TOI_WAIT_GFP (GFP_KERNEL | __GFP_NOWARN)
3930 +#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
3931 +
3932 +#ifdef CONFIG_PM_DEBUG
3933 +extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
3934 +extern void toi_kfree(int fail_num, const void *arg);
3935 +
3936 +extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
3937 +               unsigned int order);
3938 +#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
3939 +extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
3940 +extern void toi_free_page(int fail_num, unsigned long buf);
3941 +extern void toi__free_page(int fail_num, struct page *page);
3942 +extern void toi_free_pages(int fail_num, struct page *page, int order);
3943 +extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
3944 +extern int toi_alloc_init(void);
3945 +extern void toi_alloc_exit(void);
3946 +
3947 +extern void toi_alloc_print_debug_stats(void);
3948 +
3949 +#else /* CONFIG_PM_DEBUG */
3950 +
3951 +#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
3952 +#define toi_kfree(FAIL, ALLOCN) (kfree(ALLOCN))
3953 +
3954 +#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
3955 +#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
3956 +#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
3957 +#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
3958 +#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
3959 +#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
3960 +#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
3961 +static inline int toi_alloc_init(void)
3962 +{
3963 +       return 0;
3964 +}
3965 +
3966 +static inline void toi_alloc_exit(void) { }
3967 +
3968 +static inline void toi_alloc_print_debug_stats(void) { }
3969 +
3970 +#endif
3971 diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
3972 new file mode 100644
3973 index 0000000..597c0d9
3974 --- /dev/null
3975 +++ b/kernel/power/tuxonice_atomic_copy.c
3976 @@ -0,0 +1,378 @@
3977 +/*
3978 + * kernel/power/tuxonice_atomic_copy.c
3979 + *
3980 + * Copyright 2004-2007 Nigel Cunningham (nigel at tuxonice net)
3981 + * Copyright (C) 2006 Red Hat, inc.
3982 + *
3983 + * Distributed under GPLv2.
3984 + *
3985 + * Routines for doing the atomic save/restore.
3986 + */
3987 +
3988 +#include <linux/suspend.h>
3989 +#include <linux/highmem.h>
3990 +#include <linux/cpu.h>
3991 +#include <linux/freezer.h>
3992 +#include <linux/console.h>
3993 +#include "tuxonice.h"
3994 +#include "tuxonice_storage.h"
3995 +#include "tuxonice_power_off.h"
3996 +#include "tuxonice_ui.h"
3997 +#include "power.h"
3998 +#include "tuxonice_io.h"
3999 +#include "tuxonice_prepare_image.h"
4000 +#include "tuxonice_pageflags.h"
4001 +#include "tuxonice_checksum.h"
4002 +#include "tuxonice_builtin.h"
4003 +#include "tuxonice_atomic_copy.h"
4004 +#include "tuxonice_alloc.h"
4005 +
4006 +long extra_pd1_pages_used;
4007 +
4008 +/**
4009 + * free_pbe_list: Free page backup entries used by the atomic copy code.
4010 + *
4011 + * Normally, this function isn't used. If, however, we need to abort before
4012 + * doing the atomic copy, we use this to free the pbes previously allocated.
4013 + **/
4014 +static void free_pbe_list(struct pbe **list, int highmem)
4015 +{
4016 +       while (*list) {
4017 +               int i;
4018 +               struct pbe *free_pbe, *next_page = NULL;
4019 +               struct page *page;
4020 +
4021 +               if (highmem) {
4022 +                       page = (struct page *) *list;
4023 +                       free_pbe = (struct pbe *) kmap(page);
4024 +               } else {
4025 +                       page = virt_to_page(*list);
4026 +                       free_pbe = *list;
4027 +               }
4028 +
4029 +               for (i = 0; i < PBES_PER_PAGE; i++) {
4030 +                       if (!free_pbe)
4031 +                               break;
4032 +                       if (highmem)
4033 +                               toi__free_page(29, free_pbe->address);
4034 +                       else
4035 +                               toi_free_page(29,
4036 +                                       (unsigned long) free_pbe->address);
4037 +                       free_pbe = free_pbe->next;
4038 +               }
4039 +
4040 +               if (highmem) {
4041 +                       if (free_pbe)
4042 +                               next_page = free_pbe;
4043 +                       kunmap(page);
4044 +               } else {
4045 +                       if (free_pbe)
4046 +                               next_page = free_pbe;
4047 +               }
4048 +
4049 +               toi__free_page(29, page);
4050 +               *list = (struct pbe *) next_page;
4051 +       }
4052 +}
4053 +
4054 +/**
4055 + * copyback_post: Post atomic-restore actions.
4056 + *
4057 + * After doing the atomic restore, we have a few more things to do:
4058 + * 1) We want to retain some values across the restore, so we now copy
4059 + * these from the nosave variables to the normal ones.
4060 + * 2) Set the status flags.
4061 + * 3) Resume devices.
4062 + * 4) Tell userui so it can redraw & restore settings.
4063 + * 5) Reread the page cache.
4064 + **/
4065 +
4066 +void copyback_post(void)
4067 +{
4068 +       struct toi_boot_kernel_data *bkd =
4069 +               (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
4070 +
4071 +       /*
4072 +        * The boot kernel's data may be larger (newer version) or
4073 +        * smaller (older version) than ours. Copy the minimum
4074 +        * of the two sizes, so that we don't overwrite valid values
4075 +        * from pre-atomic copy.
4076 +        */
4077 +
4078 +       memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
4079 +                       min_t(int, sizeof(struct toi_boot_kernel_data),
4080 +                               bkd->size));
4081 +
4082 +       if (toi_activate_storage(1))
4083 +               panic("Failed to reactivate our storage.");
4084 +
4085 +       toi_ui_post_atomic_restore();
4086 +
4087 +       toi_cond_pause(1, "About to reload secondary pagedir.");
4088 +
4089 +       if (read_pageset2(0))
4090 +               panic("Unable to successfully reread the page cache.");
4091 +
4092 +       /*
4093 +        * If the user wants to sleep again after resuming from full-off,
4094 +        * it's most likely to be in order to suspend to ram, so we'll
4095 +        * do this check after loading pageset2, to give them the fastest
4096 +        * wakeup when they are ready to use the computer again.
4097 +        */
4098 +       toi_check_resleep();
4099 +}
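+/*
+ * The copy above follows a simple versioned-structure pattern: the size
+ * of the structure is stored within it, and the reader copies only the
+ * minimum of its own size and the writer's. A minimal sketch of the
+ * idea (hypothetical struct, for illustration only):
+ *
+ *     struct versioned_data {
+ *             int size;               // always first, set by the writer
+ *             // ... fields appended by newer versions ...
+ *     };
+ *
+ *     void read_versioned(struct versioned_data *dst, const void *src)
+ *     {
+ *             const struct versioned_data *s = src;
+ *
+ *             memcpy(dst, src, min_t(int, sizeof(*dst), s->size));
+ *     }
+ */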
4100 +
4101 +/**
4102 + * toi_copy_pageset1: Do the atomic copy of pageset1.
4103 + *
4104 + * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
4105 + * because we can't be sure what side effects it has. On my old Duron, with
4106 + * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
4107 + * count at resume time 4 instead of 3.
4108 + *
4109 + * We don't want to call kmap_atomic unconditionally because it has the side
4110 + * effect of incrementing the preempt count, which will leave it one too high
4111 + * post resume (the page containing the preempt count will be copied after
4112 + * it's incremented). This is essentially the same problem.
4113 + **/
4114 +
4115 +void toi_copy_pageset1(void)
4116 +{
4117 +       int i;
4118 +       unsigned long source_index, dest_index;
4119 +
4120 +       source_index = get_next_bit_on(&pageset1_map, max_pfn + 1);
4121 +       dest_index = get_next_bit_on(&pageset1_copy_map, max_pfn + 1);
4122 +
4123 +       for (i = 0; i < pagedir1.size; i++) {
4124 +               unsigned long *origvirt, *copyvirt;
4125 +               struct page *origpage, *copypage;
4126 +               int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
4127 +
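+               /*
+                * Copy the page one unsigned long at a time, from the last
+                * word down to the first. A plain loop is used rather than
+                * copy_page() for the reasons given in the comment above.
+                */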
4128 +               origpage = pfn_to_page(source_index);
4129 +               copypage = pfn_to_page(dest_index);
4130 +
4131 +               origvirt = PageHighMem(origpage) ?
4132 +                       kmap_atomic(origpage, KM_USER0) :
4133 +                       page_address(origpage);
4134 +
4135 +               copyvirt = PageHighMem(copypage) ?
4136 +                       kmap_atomic(copypage, KM_USER1) :
4137 +                       page_address(copypage);
4138 +
4139 +               while (loop >= 0) {
4140 +                       *(copyvirt + loop) = *(origvirt + loop);
4141 +                       loop--;
4142 +               }
4143 +
4144 +               if (PageHighMem(origpage))
4145 +                       kunmap_atomic(origvirt, KM_USER0);
4146 +               else if (toi_faulted) {
4147 +                       printk(KERN_INFO "%p (%lu) being unmapped after "
4148 +                               "faulting during atomic copy.\n", origpage,
4149 +                               source_index);
4150 +                       kernel_map_pages(origpage, 1, 0);
4151 +                       clear_toi_fault();
4152 +               }
4153 +
4154 +               if (PageHighMem(copypage))
4155 +                       kunmap_atomic(copyvirt, KM_USER1);
4156 +
4157 +               source_index = get_next_bit_on(&pageset1_map, source_index);
4158 +               dest_index = get_next_bit_on(&pageset1_copy_map, dest_index);
4159 +       }
4160 +}
4161 +
4162 +/**
4163 + * __toi_post_context_save: Steps after saving the cpu context.
4164 + *
4165 + * Steps taken after saving the CPU state to make the actual
4166 + * atomic copy.
4167 + *
4168 + * Called from swsusp_save in snapshot.c via toi_post_context_save.
4169 + **/
4170 +
4171 +int __toi_post_context_save(void)
4172 +{
4173 +       long old_ps1_size = pagedir1.size;
4174 +
4175 +       check_checksums();
4176 +
4177 +       free_checksum_pages();
4178 +
4179 +       toi_recalculate_image_contents(1);
4180 +
4181 +       extra_pd1_pages_used = pagedir1.size - old_ps1_size;
4182 +
4183 +       if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
4184 +               printk(KERN_INFO "Pageset1 has grown by %ld pages. "
4185 +                       "extra_pages_allowance is currently only %lu.\n",
4186 +                       pagedir1.size - old_ps1_size,
4187 +                       extra_pd1_pages_allowance);
4188 +               set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
4189 +               return -1;
4190 +       }
4191 +
4192 +       if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
4193 +           !test_action_state(TOI_TEST_BIO))
4194 +               toi_copy_pageset1();
4195 +
4196 +       return 0;
4197 +}
4198 +
4199 +/**
4200 + * toi_hibernate: High level code for doing the atomic copy.
4201 + *
4202 + * High-level code which prepares to do the atomic copy. Loosely based
4203 + * on the swsusp version, but with the following twists:
4204 + * - We set toi_running so the swsusp code uses our code paths.
4205 + * - We give better feedback regarding what goes wrong if there is a problem.
4206 + * - We use an extra function to call the assembly, just in case this code
4207 + *   is in a module (return address).
4208 + **/
4209 +
4210 +int toi_hibernate(void)
4211 +{
4212 +       int error;
4213 +
4214 +       toi_running = 1; /* For the swsusp code we use :< */
4215 +
4216 +       error = toi_lowlevel_builtin();
4217 +
4218 +       toi_running = 0;
4219 +       return error;
4220 +}
4221 +
4222 +/**
4223 + * toi_atomic_restore: Prepare to do the atomic restore.
4224 + *
4225 + * Get ready to do the atomic restore. This part gets us into the same
4226 + * state we are in prior to calling do_toi_lowlevel while
4227 + * hibernating: hot-unplugging secondary CPUs and freezing processes,
4228 + * before starting the thread that will do the restore.
4229 + **/
4230 +
4231 +int toi_atomic_restore(void)
4232 +{
4233 +       int error;
4234 +
4235 +       toi_running = 1;
4236 +
4237 +       toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
4238 +
4239 +       if (add_boot_kernel_data_pbe())
4240 +               goto Failed;
4241 +
4242 +       if (toi_go_atomic(PMSG_PRETHAW, 0))
4243 +               goto Failed;
4244 +
4245 +       /* We'll ignore saved state, but this gets preempt count (etc) right */
4246 +       save_processor_state();
4247 +
4248 +       error = swsusp_arch_resume();
4249 +       /*
4250 +        * Code below is only ever reached in case of failure. Otherwise
4251 + * execution continues at the place where swsusp_arch_suspend was called.
4252 +        *
4253 +        * We don't know whether it's safe to continue (this shouldn't happen),
4254 + * so let's err on the side of caution.
4255 +        */
4256 +       BUG();
4257 +
4258 +Failed:
4259 +       free_pbe_list(&restore_pblist, 0);
4260 +#ifdef CONFIG_HIGHMEM
4261 +       free_pbe_list(&restore_highmem_pblist, 1);
4262 +#endif
4263 +       if (test_action_state(TOI_PM_PREPARE_CONSOLE))
4264 +               pm_restore_console();
4265 +       toi_running = 0;
4266 +       return 1;
4267 +}
4268 +
4269 +int toi_go_atomic(pm_message_t state, int suspend_time)
4270 +{
4271 +       toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
4272 +
4273 +       if (suspend_time && toi_platform_begin()) {
4274 +               set_abort_result(TOI_PLATFORM_PREP_FAILED);
4275 +               toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time);
4276 +               return 1;
4277 +       }
4278 +
4279 +       suspend_console();
4280 +
4281 +       if (device_suspend(state)) {
4282 +               set_abort_result(TOI_DEVICE_REFUSED);
4283 +               toi_end_atomic(ATOMIC_STEP_RESUME_CONSOLE, suspend_time);
4284 +               return 1;
4285 +       }
4286 +
4287 +       if (suspend_time && toi_platform_pre_snapshot()) {
4288 +               set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
4289 +               toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time);
4290 +               return 1;
4291 +       }
4292 +
4293 +       if (!suspend_time && toi_platform_pre_restore()) {
4294 +               set_abort_result(TOI_PRE_RESTORE_FAILED);
4295 +               toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time);
4296 +               return 1;
4297 +       }
4298 +
4299 +       if (test_action_state(TOI_LATE_CPU_HOTPLUG)) {
4300 +               if (disable_nonboot_cpus()) {
4301 +                       set_abort_result(TOI_CPU_HOTPLUG_FAILED);
4302 +                       toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG,
4303 +                                       suspend_time);
4304 +                       return 1;
4305 +               }
4306 +       }
4307 +
4308 +       if (suspend_time && arch_prepare_suspend()) {
4309 +               set_abort_result(TOI_ARCH_PREPARE_FAILED);
4310 +               toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, suspend_time);
4311 +               return 1;
4312 +       }
4313 +
4314 +       local_irq_disable();
4315 +
4316 +       /* At this point, device_suspend() has been called, but *not*
4317 +        * device_power_down(). We *must* device_power_down() now.
4318 +        * Otherwise, drivers for some devices (e.g. interrupt controllers)
4319 +        * become desynchronized with the actual state of the hardware
4320 +        * at resume time, and evil weirdness ensues.
4321 +        */
4322 +
4323 +       if (device_power_down(state)) {
4324 +               set_abort_result(TOI_DEVICE_REFUSED);
4325 +               toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time);
4326 +               return 1;
4327 +       }
4328 +
4329 +       return 0;
4330 +}
4331 +
4332 +void toi_end_atomic(int stage, int suspend_time)
4333 +{
4334 +       switch (stage) {
4335 +       case ATOMIC_ALL_STEPS:
4336 +               if (!suspend_time)
4337 +                       toi_platform_leave();
4338 +               device_power_up();
4339 +       case ATOMIC_STEP_IRQS:
4340 +               local_irq_enable();
4341 +       case ATOMIC_STEP_CPU_HOTPLUG:
4342 +               if (test_action_state(TOI_LATE_CPU_HOTPLUG))
4343 +                       enable_nonboot_cpus();
4344 +               toi_platform_finish();
4345 +       case ATOMIC_STEP_DEVICE_RESUME:
4346 +               device_resume();
4347 +       case ATOMIC_STEP_RESUME_CONSOLE:
4348 +               resume_console();
4349 +       case ATOMIC_STEP_PLATFORM_END:
4350 +               toi_platform_end();
4351 +
4352 +               toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
4353 +       }
4354 +}
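+/*
+ * toi_go_atomic() and toi_end_atomic() are designed to pair up: each
+ * failure path in toi_go_atomic() invokes toi_end_atomic() with the first
+ * step that still needs unwinding, and the switch above deliberately
+ * falls through so that every later step runs as well. A sketch of the
+ * expected calling pattern (hypothetical caller, for illustration only):
+ */
+#if 0
+static int example_atomic_cycle(void)
+{
+       if (toi_go_atomic(PMSG_FREEZE, 1))
+               return 1;       /* toi_go_atomic has already unwound. */
+
+       /* ... make the atomic copy ... */
+
+       toi_end_atomic(ATOMIC_ALL_STEPS, 1);
+       return 0;
+}
+#endif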
4355 diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
4356 new file mode 100644
4357 index 0000000..5cb9dfe
4358 --- /dev/null
4359 +++ b/kernel/power/tuxonice_atomic_copy.h
4360 @@ -0,0 +1,21 @@
4361 +/*
4362 + * kernel/power/tuxonice_atomic_copy.h
4363 + *
4364 + * Copyright 2007 Nigel Cunningham (nigel at tuxonice net)
4365 + *
4366 + * Distributed under GPLv2.
4367 + *
4368 + * Routines for doing the atomic save/restore.
4369 + */
4370 +
4371 +enum {
4372 +       ATOMIC_ALL_STEPS,
4373 +       ATOMIC_STEP_IRQS,
4374 +       ATOMIC_STEP_CPU_HOTPLUG,
4375 +       ATOMIC_STEP_DEVICE_RESUME,
4376 +       ATOMIC_STEP_RESUME_CONSOLE,
4377 +       ATOMIC_STEP_PLATFORM_END,
4378 +};
4379 +
4380 +int toi_go_atomic(pm_message_t state, int toi_time);
4381 +void toi_end_atomic(int stage, int toi_time);
4382 diff --git a/kernel/power/tuxonice_block_io.c b/kernel/power/tuxonice_block_io.c
4383 new file mode 100644
4384 index 0000000..913e813
4385 --- /dev/null
4386 +++ b/kernel/power/tuxonice_block_io.c
4387 @@ -0,0 +1,1186 @@
4388 +/*
4389 + * kernel/power/tuxonice_block_io.c
4390 + *
4391 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
4392 + *
4393 + * Distributed under GPLv2.
4394 + *
4395 + * This file contains block io functions for TuxOnIce. These are
4396 + * used by the swapwriter and it is planned that they will also
4397 + * be used by the NFSwriter.
4398 + *
4399 + */
4400 +
4401 +#include <linux/blkdev.h>
4402 +#include <linux/syscalls.h>
4403 +#include <linux/suspend.h>
4404 +
4405 +#include "tuxonice.h"
4406 +#include "tuxonice_sysfs.h"
4407 +#include "tuxonice_modules.h"
4408 +#include "tuxonice_prepare_image.h"
4409 +#include "tuxonice_block_io.h"
4410 +#include "tuxonice_ui.h"
4411 +#include "tuxonice_alloc.h"
4412 +#include "tuxonice_io.h"
4413 +
4414 +
4415 +#if 0
4416 +static int pr_index;
4417 +
4418 +static inline void reset_pr_index(void)
4419 +{
4420 +       pr_index = 0;
4421 +}
4422 +
4423 +#define PR_DEBUG(a, b...) do { \
4424 +       if (pr_index < 20) \
4425 +               printk(a, ##b); \
4426 +} while (0)
4427 +
4428 +static inline void inc_pr_index(void)
4429 +{
4430 +       pr_index++;
4431 +}
4432 +#else
4433 +#define PR_DEBUG(a, b...) do { } while (0)
4434 +#define reset_pr_index() do { } while (0)
4435 +#define inc_pr_index() do { } while (0)
4436 +#endif
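+/*
+ * When the #if 0 block above is enabled, PR_DEBUG() is rate limited:
+ * reset_pr_index() is called at the start of a stream and only the first
+ * twenty messages after that are printed, e.g.:
+ *
+ *     reset_pr_index();
+ *     PR_DEBUG("%d: PFN %ld.\n", pr_index, pfn);
+ *     inc_pr_index();
+ */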
4437 +
4438 +#define TARGET_OUTSTANDING_IO 16384
4439 +
4440 +#define MEASURE_MUTEX_CONTENTION
4441 +#ifndef MEASURE_MUTEX_CONTENTION
4442 +#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
4443 +#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
4444 +#else
4445 +unsigned long mutex_times[2][2][NR_CPUS];
4446 +#define my_mutex_lock(index, the_lock) do { \
4447 +       int have_mutex; \
4448 +       have_mutex = mutex_trylock(the_lock); \
4449 +       if (!have_mutex) { \
4450 +               mutex_lock(the_lock); \
4451 +               mutex_times[index][0][smp_processor_id()]++; \
4452 +       } else { \
4453 +               mutex_times[index][1][smp_processor_id()]++; \
4454 +       }
4455 +
4456 +#define my_mutex_unlock(index, the_lock) \
4457 +       mutex_unlock(the_lock); \
4458 +} while (0)
4459 +#endif
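+/*
+ * Note that with MEASURE_MUTEX_CONTENTION defined, my_mutex_lock() opens
+ * a "do {" block which my_mutex_unlock() closes with "} while (0)", so
+ * the two must always appear as a matched pair in the same scope:
+ *
+ *     my_mutex_lock(0, &toi_bio_mutex);
+ *     ... critical section ...
+ *     my_mutex_unlock(0, &toi_bio_mutex);
+ */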
4460 +
4461 +static int target_outstanding_io = 1024;
4462 +static int max_outstanding_writes, max_outstanding_reads;
4463 +
4464 +static struct page *bio_queue_head, *bio_queue_tail;
4465 +static DEFINE_SPINLOCK(bio_queue_lock);
4466 +
4467 +static int free_mem_throttle;
4468 +static int more_readahead = 1;
4469 +static struct page *readahead_list_head, *readahead_list_tail;
4470 +
4471 +static struct page *waiting_on;
4472 +
4473 +static atomic_t toi_io_in_progress;
4474 +static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
4475 +
4476 +static int extra_page_forward;
4477 +
4478 +static int current_stream;
4479 +/* 0 = Header, 1 = Pageset1, 2 = Pageset2, 3 = End of PS1 */
4480 +struct extent_iterate_saved_state toi_writer_posn_save[4];
4481 +
4482 +/* Pointer to current entry being loaded/saved. */
4483 +struct extent_iterate_state toi_writer_posn;
4484 +
4485 +/* Not static, so that the allocators can set up and complete
4486 + * writing the header */
4487 +char *toi_writer_buffer;
4488 +int toi_writer_buffer_posn;
4489 +
4490 +static struct toi_bdev_info *toi_devinfo;
4491 +
4492 +DEFINE_MUTEX(toi_bio_mutex);
4493 +
4494 +static struct task_struct *toi_queue_flusher;
4495 +static int toi_bio_queue_flush_pages(int dedicated_thread);
4496 +
4497 +/**
4498 + * set_throttle: Set the point where we pause to avoid oom.
4499 + *
4500 + * Initially, this value is zero, but when we first fail to allocate memory,
4501 + * we set it (plus a buffer) and thereafter throttle i/o once that limit is
4502 + * reached.
4503 + */
4504 +
4505 +static void set_throttle(void)
4506 +{
4507 +       int new_throttle = nr_unallocated_buffer_pages() + 256;
4508 +
4509 +       if (new_throttle > free_mem_throttle)
4510 +               free_mem_throttle = new_throttle;
4511 +}
4512 +
4513 +#define NUM_REASONS 10
4514 +static atomic_t reasons[NUM_REASONS];
4515 +static char *reason_name[NUM_REASONS] = {
4516 +       "readahead not ready",
4517 +       "bio allocation",
4518 +       "io_struct allocation",
4519 +       "submit buffer",
4520 +       "synchronous I/O",
4521 +       "bio mutex when reading",
4522 +       "bio mutex when writing",
4523 +       "toi_bio_get_new_page",
4524 +       "memory low",
4525 +       "readahead buffer allocation"
4526 +};
4527 +
4528 +/**
4529 + * do_bio_wait: Wait for some TuxOnIce i/o to complete.
4530 + *
4531 + * Wait for the page we're waiting on to be unlocked (if one is set), or
4532 + * for in-flight i/o to complete, recording the reason for the wait.
4533 + */
4534 +static void do_bio_wait(int reason)
4535 +{
4536 +       struct page *was_waiting_on = waiting_on;
4537 +
4538 +       /* On SMP, waiting_on can be reset, so we make a copy */
4539 +       if (was_waiting_on) {
4540 +               if (PageLocked(was_waiting_on)) {
4541 +                       wait_on_page_bit(was_waiting_on, PG_locked);
4542 +                       atomic_inc(&reasons[reason]);
4543 +               }
4544 +       } else {
4545 +               atomic_inc(&reasons[reason]);
4546 +
4547 +               wait_event(num_in_progress_wait,
4548 +                       !atomic_read(&toi_io_in_progress) ||
4549 +                       nr_unallocated_buffer_pages() > free_mem_throttle);
4550 +       }
4551 +}
4552 +
4553 +static void throttle_if_memory_low(void)
4554 +{
4555 +       int free_pages = nr_unallocated_buffer_pages();
4556 +
4557 +       /* Getting low on memory and I/O is in progress? */
4558 +       while (unlikely(free_pages < free_mem_throttle) &&
4559 +                       atomic_read(&toi_io_in_progress)) {
4560 +               do_bio_wait(8);
4561 +               free_pages = nr_unallocated_buffer_pages();
4562 +       }
4563 +
4564 +       wait_event(num_in_progress_wait,
4565 +               atomic_read(&toi_io_in_progress) < target_outstanding_io);
4566 +}
4567 +
4568 +/**
4569 + * toi_finish_all_io: Complete all outstanding i/o.
4570 + */
4571 +static void toi_finish_all_io(void)
4572 +{
4573 +       wait_event(num_in_progress_wait, !atomic_read(&toi_io_in_progress));
4574 +}
4575 +
4576 +/**
4577 + * toi_end_bio: bio completion function.
4578 + *
4579 + * @bio: bio that has completed.
4580 + * @err: Error value. Yes, like end_swap_bio_read, we ignore it.
4581 + *
4582 + * Function called by block driver from interrupt context when I/O is completed.
4583 + * Nearly the fs/buffer.c version, but we want to do our cleanup too. We only
4584 + * free pages if they were buffers used when writing the image.
4585 + */
4586 +static void toi_end_bio(struct bio *bio, int err)
4587 +{
4588 +       struct page *page = bio->bi_io_vec[0].bv_page;
4589 +
4590 +       BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
4591 +
4592 +       unlock_page(page);
4593 +       bio_put(bio);
4594 +
4595 +       if (waiting_on == page)
4596 +               waiting_on = NULL;
4597 +
4598 +       put_page(page);
4599 +
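+       /*
+        * One reference is still held here (bio_alloc() plus the bio_get()
+        * in submit(), minus the bio_put() above), so reading bi_private
+        * remains safe; the bio_put() below drops the final reference.
+        */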
4600 +       if (bio->bi_private)
4601 +               toi__free_page((int) ((unsigned long) bio->bi_private), page);
4602 +
4603 +       bio_put(bio);
4604 +
4605 +       atomic_dec(&toi_io_in_progress);
4606 +
4607 +       wake_up(&num_in_progress_wait);
4608 +}
4609 +
4610 +/**
4611 + *     submit - submit BIO request.
4612 + *     @writing: READ or WRITE.
4613 + *
4614 + *     Based on Patrick's pmdisk code from long ago:
4615 + *     "Straight from the textbook - allocate and initialize the bio.
4616 + *     If we're writing, make sure the page is marked as dirty.
4617 + *     Then submit it and carry on."
4618 + *
4619 + *     With a twist, though - we handle block_size != PAGE_SIZE.
4620 + *     Caller has already checked that our page is not fragmented.
4621 + */
4622 +static int submit(int writing, struct block_device *dev, sector_t first_block,
4623 +               struct page *page, int free_group)
4624 +{
4625 +       struct bio *bio = NULL;
4626 +       int cur_outstanding_io;
4627 +
4628 +       throttle_if_memory_low();
4629 +
4630 +       while (!bio) {
4631 +               bio = bio_alloc(TOI_ATOMIC_GFP, 1);
4632 +               if (!bio) {
4633 +                       set_throttle();
4634 +                       do_bio_wait(1);
4635 +               }
4636 +       }
4637 +
4638 +       bio->bi_bdev = dev;
4639 +       bio->bi_sector = first_block;
4640 +       bio->bi_private = (void *) ((unsigned long) free_group);
4641 +       bio->bi_end_io = toi_end_bio;
4642 +
4643 +       if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
4644 +               printk(KERN_INFO "ERROR: adding page to bio at %lld\n",
4645 +                               (unsigned long long) first_block);
4646 +               bio_put(bio);
4647 +               return -EFAULT;
4648 +       }
4649 +
4650 +       bio_get(bio);
4651 +
4652 +       cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
4653 +       if (writing) {
4654 +               if (cur_outstanding_io > max_outstanding_writes)
4655 +                       max_outstanding_writes = cur_outstanding_io;
4656 +       } else {
4657 +               if (cur_outstanding_io > max_outstanding_reads)
4658 +                       max_outstanding_reads = cur_outstanding_io;
4659 +       }
4660 +
4661 +
4662 +       if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) {
4663 +               /* Fake having done the hard work */
4664 +               set_bit(BIO_UPTODATE, &bio->bi_flags);
4665 +               toi_end_bio(bio, 0);
4666 +       } else
4667 +               submit_bio(writing | (1 << BIO_RW_SYNC), bio);
4668 +
4669 +       return 0;
4670 +}
4671 +
4672 +/**
4673 + * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
4674 + *
4675 + * @writing: Whether reading or writing.
4676 + * @bdev: The block device which we're using.
4677 + * @block0: The first sector we're reading or writing.
4678 + * @page: The page on which I/O is being done.
4679 + * @is_readahead: Whether we're doing readahead (the page joins the readahead list).
4680 + * @syncio: Whether the i/o is being done synchronously.
4681 + *
4682 + * Prepare and start a read or write operation.
4683 + *
4684 + * Note that we always work with our own page. If writing, we might be given a
4685 + * compression buffer that will immediately be used to start compressing the
4686 + * next page. For reading, we do readahead and therefore don't know the final
4687 + * address where the data needs to go.
4688 + */
4689 +static int toi_do_io(int writing, struct block_device *bdev, long block0,
4690 +       struct page *page, int is_readahead, int syncio, int free_group)
4691 +{
4692 +       page->private = 0;
4693 +
4694 +       /* Lock here so we don't race against toi_bio_get_next_page_read */
4695 +       lock_page(page);
4696 +
4697 +       if (is_readahead) {
4698 +               if (readahead_list_head)
4699 +                       readahead_list_tail->private = (unsigned long) page;
4700 +               else
4701 +                       readahead_list_head = page;
4702 +
4703 +               readahead_list_tail = page;
4704 +       }
4705 +
4706 +       /* Done before submitting to avoid races. */
4707 +       if (syncio)
4708 +               waiting_on = page;
4709 +
4710 +       /* Submit the page */
4711 +       get_page(page);
4712 +
4713 +       if (submit(writing, bdev, block0, page, free_group))
4714 +               return -EFAULT;
4715 +
4716 +       if (syncio)
4717 +               do_bio_wait(4);
4718 +
4719 +       return 0;
4720 +}
4721 +
4722 +/**
4723 + * toi_bdev_page_io: Simpler interface to do directly i/o on a single page.
4724 + *
4725 + * @writing: Whether reading or writing.
4726 + * @bdev: Block device on which we're operating.
4727 + * @pos: Sector at which page to read starts.
4728 + * @page: Page to be read/written.
4729 + *
4730 + * We used to use bread here, but it doesn't correctly handle
4731 + * blocksize != PAGE_SIZE. Now we submit the page through our normal
4732 + * i/o routines (synchronously).
4733 + */
4734 +static int toi_bdev_page_io(int writing, struct block_device *bdev,
4735 +               long pos, struct page *page)
4736 +{
4737 +       return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
4738 +}
4739 +
4740 +/**
4741 + * toi_bio_memory_needed: Report amount of memory needed for block i/o.
4742 + *
4743 + * We want to have at least enough memory so as to have target_outstanding_io
4744 + * or more transactions on the fly at once. If we can do more, fine.
4745 + */
4746 +static int toi_bio_memory_needed(void)
4747 +{
4748 +       return (target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
4749 +                               sizeof(struct bio)));
4750 +}
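+/*
+ * For example, with the default target_outstanding_io of 1024 and 4K
+ * pages, this asks for roughly 1024 * (4096 + sizeof(struct request) +
+ * sizeof(struct bio)) bytes - a little over 4MB - so that we can keep
+ * the target number of transactions in flight without running out of
+ * memory.
+ */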
4751 +
4752 +/*
4753 + * toi_bio_print_debug_stats
4754 + *
4755 + * Description: Print i/o statistics for the user interface's debug info.
4756 + */
4757 +static int toi_bio_print_debug_stats(char *buffer, int size)
4758 +{
4759 +       int len = snprintf_used(buffer, size, "- Max outstanding reads %d. Max "
4760 +                       "writes %d.\n", max_outstanding_reads,
4761 +                       max_outstanding_writes);
4762 +
4763 +       len += snprintf_used(buffer + len, size - len,
4764 +               "  Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
4765 +               target_outstanding_io,
4766 +               PAGE_SIZE, (unsigned int) sizeof(struct request),
4767 +               (unsigned int) sizeof(struct bio), toi_bio_memory_needed());
4768 +
4769 +#ifdef MEASURE_MUTEX_CONTENTION
4770 +       {
4771 +       int i;
4772 +
4773 +       len += snprintf_used(buffer + len, size - len,
4774 +               "  Mutex contention while reading:\n  Contended      Free\n");
4775 +
4776 +       for (i = 0; i < NR_CPUS; i++)
4777 +               len += snprintf_used(buffer + len, size - len,
4778 +               "  %9lu %9lu\n",
4779 +               mutex_times[0][0][i], mutex_times[0][1][i]);
4780 +
4781 +       len += snprintf_used(buffer + len, size - len,
4782 +               "  Mutex contention while writing:\n  Contended      Free\n");
4783 +
4784 +       for (i = 0; i < NR_CPUS; i++)
4785 +               len += snprintf_used(buffer + len, size - len,
4786 +               "  %9lu %9lu\n",
4787 +               mutex_times[1][0][i], mutex_times[1][1][i]);
4788 +
4789 +       }
4790 +#endif
4791 +
4792 +       return len + snprintf_used(buffer + len, size - len,
4793 +               "  Free mem throttle point reached %d.\n", free_mem_throttle);
4794 +}
4795 +
4796 +/**
4797 + * toi_set_devinfo: Set the bdev info used for i/o.
4798 + *
4799 + * @info: Pointer to array of struct toi_bdev_info - the list of
4800 + * bdevs and blocks on them in which the image is stored.
4801 + *
4802 + * Set the list of bdevs and blocks in which the image will be stored.
4803 + * Sort of like putting a tape in the cassette player.
4804 + */
4805 +static void toi_set_devinfo(struct toi_bdev_info *info)
4806 +{
4807 +       toi_devinfo = info;
4808 +}
4809 +
4810 +/**
4811 + * dump_block_chains: Print the contents of the bdev info array.
4812 + */
4813 +static void dump_block_chains(void)
4814 +{
4815 +       int i;
4816 +
4817 +       for (i = 0; i < toi_writer_posn.num_chains; i++) {
4818 +               struct extent *this;
4819 +
4820 +               this = (toi_writer_posn.chains + i)->first;
4821 +
4822 +               if (!this)
4823 +                       continue;
4824 +
4825 +               printk(KERN_INFO "Chain %d:", i);
4826 +
4827 +               while (this) {
4828 +                       printk(" [%lu-%lu]%s", this->minimum,
4829 +                                       this->maximum, this->next ? "," : "");
4830 +                       this = this->next;
4831 +               }
4832 +
4833 +               printk("\n");
4834 +       }
4835 +
4836 +       for (i = 0; i < 4; i++)
4837 +               printk(KERN_INFO "Posn %d: Chain %d, extent %d, offset %lu.\n",
4838 +                               i, toi_writer_posn_save[i].chain_num,
4839 +                               toi_writer_posn_save[i].extent_num,
4840 +                               toi_writer_posn_save[i].offset);
4841 +}
4842 +
4843 +/**
4844 + * go_next_page: Skip blocks to the start of the next page.
4845 + *
4846 + * Go forward one page, or two if extra_page_forward is set. That flag only
4847 + * gets set at the start of reading the image header, to skip the first page
4848 + * of the header, which is read without using the extent chains.
4849 + */
4850 +static int go_next_page(int writing)
4851 +{
4852 +       int i, max = (toi_writer_posn.current_chain == -1) ? 1 :
4853 +         toi_devinfo[toi_writer_posn.current_chain].blocks_per_page;
4854 +
4855 +       for (i = 0; i < max; i++)
4856 +               toi_extent_state_next(&toi_writer_posn);
4857 +
4858 +       if (toi_extent_state_eof(&toi_writer_posn)) {
4859 +               /* Don't complain if readahead falls off the end */
4860 +               if (writing) {
4861 +                       printk(KERN_INFO "Extent state eof. "
4862 +                               "Expected compression ratio too optimistic?\n");
4863 +                       dump_block_chains();
4864 +               }
4865 +               return -ENODATA;
4866 +       }
4867 +
4868 +       if (extra_page_forward) {
4869 +               extra_page_forward = 0;
4870 +               return go_next_page(writing);
4871 +       }
4872 +
4873 +       return 0;
4874 +}
4875 +
4876 +/**
4877 + * set_extra_page_forward: Make us skip an extra page on next go_next_page.
4878 + *
4879 + * Used in reading header, to jump to 2nd page after getting 1st page
4880 + * direct from image header.
4881 + */
4882 +static void set_extra_page_forward(void)
4883 +{
4884 +       extra_page_forward = 1;
4885 +}
4886 +
4887 +/**
4888 + * toi_bio_rw_page: Do i/o on the next disk page in the image.
4889 + *
4890 + * @writing: Whether reading or writing.
4891 + * @page: Page to do i/o on.
4892 + * @is_readahead: Whether the page is part of the readahead list.
4893 + *
4894 + * Submit a page for reading or writing, possibly readahead.
4895 + */
4896 +static int toi_bio_rw_page(int writing, struct page *page,
4897 +               int is_readahead, int free_group)
4898 +{
4899 +       struct toi_bdev_info *dev_info;
4900 +       int result;
4901 +
4902 +       if (go_next_page(writing)) {
4903 +               printk(KERN_INFO "Failed to advance a page in the extent "
4904 +                               "data.\n");
4905 +               return -ENODATA;
4906 +       }
4907 +
4908 +       if (current_stream == 0 && writing &&
4909 +               toi_writer_posn.current_chain ==
4910 +                       toi_writer_posn_save[2].chain_num &&
4911 +               toi_writer_posn.current_offset ==
4912 +                       toi_writer_posn_save[2].offset) {
4913 +               dump_block_chains();
4914 +               BUG();
4915 +       }
4916 +
4917 +       dev_info = &toi_devinfo[toi_writer_posn.current_chain];
4918 +
4919 +       result = toi_do_io(writing, dev_info->bdev,
4920 +               toi_writer_posn.current_offset <<
4921 +                       dev_info->bmap_shift,
4922 +               page, is_readahead, 0, free_group);
4923 +
4924 +       if (result) {
4925 +               more_readahead = 0;
4926 +               return result;
4927 +       }
4928 +
4929 +       if (!writing) {
4930 +               int compare_to = 0;
4931 +
4932 +               switch (current_stream) {
4933 +               case 0:
4934 +                       compare_to = 2;
4935 +                       break;
4936 +               case 1:
4937 +                       compare_to = 3;
4938 +                       break;
4939 +               case 2:
4940 +                       compare_to = 1;
4941 +                       break;
4942 +               }
4943 +
4944 +               if (toi_writer_posn.current_chain ==
4945 +                               toi_writer_posn_save[compare_to].chain_num &&
4946 +                   toi_writer_posn.current_offset ==
4947 +                               toi_writer_posn_save[compare_to].offset)
4948 +                       more_readahead = 0;
4949 +       }
4950 +       return 0;
4951 +}
4952 +
4953 +/**
4954 + * toi_rw_init: Prepare to read or write a stream in the image.
4955 + *
4956 + * @writing: Whether reading or writing.
4957 + * @stream_number: Section of the image being processed.
4958 + */
4959 +static int toi_rw_init(int writing, int stream_number)
4960 +{
4961 +       if (stream_number)
4962 +               toi_extent_state_restore(&toi_writer_posn,
4963 +                               &toi_writer_posn_save[stream_number]);
4964 +       else
4965 +               toi_extent_state_goto_start(&toi_writer_posn);
4966 +
4967 +       toi_writer_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
4968 +       toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
4969 +
4970 +       current_stream = stream_number;
4971 +
4972 +       reset_pr_index();
4973 +       more_readahead = 1;
4974 +
4975 +       return toi_writer_buffer ? 0 : -ENOMEM;
4976 +}
4977 +
4978 +/**
4979 + * toi_read_header_init: Prepare to read the image header.
4980 + *
4981 + * Reset readahead indices prior to starting to read a section of the image.
4982 + */
4983 +static void toi_read_header_init(void)
4984 +{
4985 +       toi_writer_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
4986 +       more_readahead = 1;
4987 +}
4988 +
4989 +/*
4990 + * toi_bio_queue_write: Queue a page of output and wake the queue flusher.
4991 + */
4992 +static void toi_bio_queue_write(char **full_buffer)
4993 +{
4994 +       struct page *page = virt_to_page(*full_buffer);
4995 +       unsigned long flags;
4996 +
4997 +       page->private = 0;
4998 +
4999 +       spin_lock_irqsave(&bio_queue_lock, flags);
5000 +       if (!bio_queue_head)
5001 +               bio_queue_head = page;
5002 +       else
5003 +               bio_queue_tail->private = (unsigned long) page;
5004 +
5005 +       bio_queue_tail = page;
5006 +
5007 +       spin_unlock_irqrestore(&bio_queue_lock, flags);
5008 +       wake_up(&toi_io_queue_flusher);
5009 +
5010 +       *full_buffer = NULL;
5011 +}
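+/*
+ * The bio queue above is an intrusive singly-linked list threaded through
+ * page->private: each queued page points at the next, with bio_queue_head
+ * and bio_queue_tail tracking the ends (the readahead list uses the same
+ * trick). Dequeueing, as done in toi_bio_queue_flush_pages() below, looks
+ * like:
+ *
+ *     struct page *page = bio_queue_head;
+ *
+ *     bio_queue_head = (struct page *) page->private;
+ *     if (bio_queue_tail == page)
+ *             bio_queue_tail = NULL;
+ */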
5012 +
5013 +/**
5014 + * toi_rw_cleanup: Cleanup after i/o.
5015 + *
5016 + * @writing: Whether we were reading or writing.
5017 + */
5018 +static int toi_rw_cleanup(int writing)
5019 +{
5020 +       int i;
5021 +
5022 +       if (writing) {
5023 +               if (toi_writer_buffer_posn)
5024 +                       toi_bio_queue_write(&toi_writer_buffer);
5025 +
5026 +               toi_bio_queue_flush_pages(0);
5027 +
5028 +               if (current_stream == 2)
5029 +                       toi_extent_state_save(&toi_writer_posn,
5030 +                                       &toi_writer_posn_save[1]);
5031 +               else if (current_stream == 1)
5032 +                       toi_extent_state_save(&toi_writer_posn,
5033 +                                       &toi_writer_posn_save[3]);
5034 +       }
5035 +
5036 +       toi_finish_all_io();
5037 +
5038 +       while (readahead_list_head) {
5039 +               void *next = (void *) readahead_list_head->private;
5040 +               toi__free_page(12, readahead_list_head);
5041 +               readahead_list_head = next;
5042 +       }
5043 +
5044 +       readahead_list_tail = NULL;
5045 +
5046 +       if (!current_stream)
5047 +               return 0;
5048 +
5049 +       for (i = 0; i < NUM_REASONS; i++) {
5050 +               if (!atomic_read(&reasons[i]))
5051 +                       continue;
5052 +               printk(KERN_INFO "Waited for i/o due to %s %d times.\n",
5053 +                               reason_name[i], atomic_read(&reasons[i]));
5054 +               atomic_set(&reasons[i], 0);
5055 +       }
5056 +
5057 +       current_stream = 0;
5058 +       return 0;
5059 +}
5060 +
5061 +int toi_start_one_readahead(int dedicated_thread)
5062 +{
5063 +       char *buffer = NULL;
5064 +       int oom = 0;
5065 +
5066 +       throttle_if_memory_low();
5067 +
5068 +       while (!buffer) {
5069 +               buffer = (char *) toi_get_zeroed_page(12,
5070 +                               TOI_ATOMIC_GFP);
5071 +               if (!buffer) {
5072 +                       if (oom && !dedicated_thread)
5073 +                               return -EIO;
5074 +
5075 +                       oom = 1;
5076 +                       set_throttle();
5077 +                       do_bio_wait(9);
5078 +               }
5079 +       }
5080 +
5081 +       return toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
5082 +}
5083 +
5084 +/*
5085 + * toi_start_new_readahead
5086 + *
5087 + * Start readahead of image pages.
5088 + *
5089 + * No mutex needed because this is only ever called by one cpu.
5090 + */
5091 +static int toi_start_new_readahead(int dedicated_thread)
5092 +{
5093 +       int last_result, num_submitted = 0;
5094 +
5095 +       /* Start a new readahead? */
5096 +       if (!more_readahead)
5097 +               return 0;
5098 +
5099 +       do {
5100 +               int result = toi_start_one_readahead(dedicated_thread);
5101 +
5102 +               if (result == -EIO)
5103 +                       return 0;
5104 +               else
5105 +                       last_result = result;
5106 +
5107 +               if (last_result == -ENODATA)
5108 +                       more_readahead = 0;
5109 +
5110 +               if (!more_readahead && last_result) {
5111 +                       /*
5112 +                        * Don't complain about failing to do readahead past
5113 +                        * the end of storage.
5114 +                        */
5115 +                       if (last_result != -ENODATA)
5116 +                               printk(KERN_INFO
5117 +                                       "Begin read chunk returned %d.\n",
5118 +                                       last_result);
5119 +               } else
5120 +                       num_submitted++;
5121 +
5122 +       } while (more_readahead &&
5123 +                (dedicated_thread ||
5124 +                 (num_submitted < target_outstanding_io &&
5125 +                  atomic_read(&toi_io_in_progress) < target_outstanding_io)));
5126 +       return 0;
5127 +}
5128 +
5129 +static void bio_io_flusher(int writing)
5130 +{
5132 +       if (writing)
5133 +               toi_bio_queue_flush_pages(1);
5134 +       else
5135 +               toi_start_new_readahead(1);
5136 +}
5137 +
5138 +/**
5139 + * toi_bio_get_next_page_read: Read a disk page with readahead.
5140 + *
5141 + * Read a page from disk, submitting readahead and cleaning up finished i/o
5142 + * while we wait for the page we're after.
5143 + */
5144 +static int toi_bio_get_next_page_read(int no_readahead)
5145 +{
5146 +       unsigned long *virt;
5147 +       struct page *next;
5148 +
5149 +       /*
5150 +        * When reading the second page of the header, we have to
5151 +        * delay submitting the read until after we've gotten the
5152 +        * extents out of the first page.
5153 +        */
5154 +       if (unlikely(no_readahead && toi_start_one_readahead(0))) {
5155 +               printk(KERN_INFO "No readahead and toi_start_one_readahead returned non-zero.\n");
5156 +               return -EIO;
5157 +       }
5158 +
5159 +       /*
5160 +        * On SMP, we may need to wait for the first readahead
5161 +        * to be submitted.
5162 +        */
5163 +       if (unlikely(!readahead_list_head)) {
5164 +               BUG_ON(!more_readahead);
5165 +               do {
5166 +                       cpu_relax();
5167 +               } while (!readahead_list_head);
5168 +       }
5169 +
5170 +       if (PageLocked(readahead_list_head)) {
5171 +               waiting_on = readahead_list_head;
5172 +               do_bio_wait(0);
5173 +       }
5174 +
5175 +       virt = page_address(readahead_list_head);
5176 +       memcpy(toi_writer_buffer, virt, PAGE_SIZE);
5177 +
5178 +       next = (struct page *) readahead_list_head->private;
5179 +       toi__free_page(12, readahead_list_head);
5180 +       readahead_list_head = next;
5181 +       return 0;
5182 +}
5183 +
5184 +/*
5185 + * toi_bio_queue_flush_pages
5186 + */
5187 +
5188 +static int toi_bio_queue_flush_pages(int dedicated_thread)
5189 +{
5190 +       unsigned long flags;
5191 +       int result = 0;
5192 +
5193 +top:
5194 +       spin_lock_irqsave(&bio_queue_lock, flags);
5195 +       while (bio_queue_head) {
5196 +               struct page *page = bio_queue_head;
5197 +               bio_queue_head = (struct page *) page->private;
5198 +               if (bio_queue_tail == page)
5199 +                       bio_queue_tail = NULL;
5200 +               spin_unlock_irqrestore(&bio_queue_lock, flags);
5201 +               result = toi_bio_rw_page(WRITE, page, 0, 11);
5202 +               if (result)
5203 +                       return result;
5204 +               spin_lock_irqsave(&bio_queue_lock, flags);
5205 +       }
5206 +       spin_unlock_irqrestore(&bio_queue_lock, flags);
5207 +
5208 +       if (dedicated_thread) {
5209 +               wait_event(toi_io_queue_flusher, bio_queue_head ||
5210 +                               toi_bio_queue_flusher_should_finish);
5211 +               if (likely(!toi_bio_queue_flusher_should_finish))
5212 +                       goto top;
5213 +               toi_bio_queue_flusher_should_finish = 0;
5214 +       }
5215 +       return 0;
5216 +}
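+/*
+ * With dedicated_thread set, toi_bio_queue_flush_pages() becomes the main
+ * loop of the queue flusher (reached via the io_flusher op, implemented by
+ * bio_io_flusher() above): it drains the queue, sleeps until
+ * toi_bio_queue_write() wakes it or it is told to finish, then repeats.
+ * A sketch of a thread body using it (hypothetical, for illustration):
+ */
+#if 0
+static int example_flusher_thread(void *unused)
+{
+       return toi_bio_queue_flush_pages(1);
+}
+#endif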
5217 +
5218 +/*
5219 + * toi_bio_get_new_page
5220 + */
5221 +static void toi_bio_get_new_page(char **full_buffer)
5222 +{
5223 +       throttle_if_memory_low();
5224 +
5225 +       while (!*full_buffer) {
5226 +               *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
5227 +               if (!*full_buffer) {
5228 +                       set_throttle();
5229 +                       do_bio_wait(7);
5230 +               }
5231 +       }
5232 +}
5233 +
5234 +/*
5235 + * toi_rw_buffer: Combine smaller buffers into PAGE_SIZE I/O.
5236 + *
5237 + * @writing: Bool - whether writing (or reading).
5238 + * @buffer: The start of the buffer to write or fill.
5239 + * @buffer_size: The size of the buffer to write or fill.
+ * @no_readahead: Whether to avoid starting readahead when reading.
5240 + */
5241 +static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
+               int no_readahead)
5242 +{
5243 +       int bytes_left = buffer_size;
5244 +
5245 +       while (bytes_left) {
5246 +               char *source_start = buffer + buffer_size - bytes_left;
5247 +               char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
5248 +               int capacity = PAGE_SIZE - toi_writer_buffer_posn;
5249 +               char *to = writing ? dest_start : source_start;
5250 +               char *from = writing ? source_start : dest_start;
5251 +
5252 +               if (bytes_left <= capacity) {
5253 +                       memcpy(to, from, bytes_left);
5254 +                       toi_writer_buffer_posn += bytes_left;
5255 +                       return 0;
5256 +               }
5257 +
5258 +               /* Complete this page and start a new one */
5259 +               memcpy(to, from, capacity);
5260 +               bytes_left -= capacity;
5261 +
5262 +               if (!writing) {
5263 +                       int result = toi_bio_get_next_page_read(no_readahead);
5264 +                       if (result)
5265 +                               return result;
5266 +               } else {
5267 +                       toi_bio_queue_write(&toi_writer_buffer);
5268 +                       toi_bio_get_new_page(&toi_writer_buffer);
5269 +               }
5270 +
5271 +               toi_writer_buffer_posn = 0;
5272 +               toi_cond_pause(0, NULL);
5273 +       }
5274 +
5275 +       return 0;
5276 +}
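+/*
+ * For example, writing a 6000 byte buffer with toi_writer_buffer_posn at
+ * zero and 4096 byte pages copies the first 4096 bytes, queues that page
+ * for writing, grabs a fresh page, copies the remaining 1904 bytes and
+ * leaves toi_writer_buffer_posn at 1904 for the next caller.
+ */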
5277 +
5278 +/**
5279 + * toi_bio_read_page - read a page of the image.
5280 + *
5281 + * @pfn: The pfn where the data belongs.
5282 + * @buffer_page: The page containing the (possibly compressed) data.
5283 + * @buf_size: The number of bytes on @buffer_page used.
5284 + *
5285 + * Read a (possibly compressed) page from the image, into buffer_page,
5286 + * returning its pfn and the buffer size.
5287 + */
5288 +static int toi_bio_read_page(unsigned long *pfn, struct page *buffer_page,
5289 +               unsigned int *buf_size)
5290 +{
5291 +       int result = 0;
5292 +       char *buffer_virt = kmap(buffer_page);
5293 +
5294 +       inc_pr_index();
5295 +
5296 +       /* Only call start_new_readahead if we don't have a dedicated thread */
5297 +       if (current == toi_queue_flusher && toi_start_new_readahead(0)) {
5298 +               printk(KERN_INFO "Queue flusher: toi_start_new_readahead "
+                       "returned non-zero.\n");
+               kunmap(buffer_page);
5299 +               return -EIO;
5300 +       }
5301 +
5302 +       my_mutex_lock(0, &toi_bio_mutex);
5303 +
5304 +       if (toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
5305 +           toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
5306 +           toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
5307 +               abort_hibernate(TOI_FAILED_IO, "Read of data failed.");
5308 +               result = 1;
5309 +       } else
5310 +               PR_DEBUG("%d: PFN %ld, %d bytes.\n", pr_index, *pfn, *buf_size);
5311 +
5312 +       my_mutex_unlock(0, &toi_bio_mutex);
5313 +       kunmap(buffer_page);
5314 +       return result;
5315 +}
5316 +
5317 +/**
5318 + * toi_bio_write_page - Write a page of the image.
5319 + *
5320 + * @pfn: The pfn where the data belongs.
5321 + * @buffer_page: The page containing the (possibly compressed) data.
5322 + * @buf_size: The number of bytes on @buffer_page used.
5323 + *
5324 + * Write a (possibly compressed) page to the image from the buffer, together
5325 + * with its index and buffer size.
5326 + */
5327 +static int toi_bio_write_page(unsigned long pfn, struct page *buffer_page,
5328 +               unsigned int buf_size)
5329 +{
5330 +       char *buffer_virt;
5331 +       int result = 0, result2 = 0;
5332 +
5333 +       inc_pr_index();
5334 +
5335 +       if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
5336 +               return 0;
5337 +
5338 +       my_mutex_lock(1, &toi_bio_mutex);
5339 +       buffer_virt = kmap(buffer_page);
5340 +
5341 +       if (toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
5342 +           toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
5343 +           toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
5344 +               printk(KERN_INFO "toi_rw_buffer returned non-zero to toi_bio_write_page.\n");
5345 +               result = -EIO;
5346 +       }
5347 +
5348 +       PR_DEBUG("%d: Index %ld, %d bytes. Result %d.\n", pr_index, pfn,
5349 +                       buf_size, result);
5350 +
5351 +       kunmap(buffer_page);
5352 +       my_mutex_unlock(1, &toi_bio_mutex);
5353 +
5354 +       if (current == toi_queue_flusher)
5355 +               result2 = toi_bio_queue_flush_pages(0);
5356 +
5357 +       return result ? result : result2;
5358 +}
5359 +
5360 +/**
5361 + * toi_rw_header_chunk: Read or write a portion of the image header.
5362 + *
5363 + * @writing: Whether reading or writing.
5364 + * @owner: The module for which we're writing. Used for confirming that modules
5365 + * don't use more header space than they asked for.
5366 + * @buffer: Address of the data to write.
5367 + * @buffer_size: Size of the data buffer.
5368 + * @no_readahead: Don't try to start readahead (when still getting extents)
5369 + */
5370 +static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
5371 +               char *buffer, int buffer_size, int no_readahead)
5372 +{
5373 +       int result = 0;
5374 +
5375 +       if (owner) {
5376 +               owner->header_used += buffer_size;
5377 +               toi_message(TOI_HEADER, TOI_LOW, 1,
5378 +                       "Header: %s : %d bytes (%d/%d).\n",
5379 +                       owner->name, buffer_size, owner->header_used,
5380 +                       owner->header_requested);
5381 +               if (owner->header_used > owner->header_requested) {
5382 +                       printk(KERN_EMERG "TuxOnIce module %s is using more "
5383 +                               "header space (%u) than it requested (%u).\n",
5384 +                               owner->name,
5385 +                               owner->header_used,
5386 +                               owner->header_requested);
5387 +                       return buffer_size;
5388 +               }
5389 +       } else
5390 +               toi_message(TOI_HEADER, TOI_LOW, 1,
5391 +                       "Header: (No owner): %d bytes.\n", buffer_size);
5392 +
5393 +       if (!writing && !no_readahead)
5394 +               result = toi_start_new_readahead(0);
5395 +
5396 +       if (!result)
5397 +               result = toi_rw_buffer(writing, buffer, buffer_size, no_readahead);
5398 +
5399 +       return result;
5400 +}
5401 +
5402 +static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
5403 +               char *buffer, int size)
5404 +{
5405 +       return _toi_rw_header_chunk(writing, owner, buffer, size, 0);
5406 +}
5407 +
5408 +static int toi_rw_header_chunk_noreadahead(int writing,
5409 +               struct toi_module_ops *owner, char *buffer, int size)
5410 +{
5411 +       return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
5412 +}
5413 +
5414 +/**
5415 + * write_header_chunk_finish: Flush any buffered header data.
5416 + */
5417 +static int write_header_chunk_finish(void)
5418 +{
5419 +       int result = 0;
5420 +
5421 +       if (toi_writer_buffer_posn)
5422 +               toi_bio_queue_write(&toi_writer_buffer);
5423 +
5424 +       toi_bio_queue_flush_pages(0);
5425 +       toi_finish_all_io();
5426 +
5427 +       return result;
5428 +}
5429 +
5430 +/**
5431 + * toi_bio_storage_needed: Get the amount of storage needed for my fns.
5432 + */
5433 +static int toi_bio_storage_needed(void)
5434 +{
5435 +       return 2 * sizeof(int);
5436 +}
5437 +
5438 +/**
5439 + * toi_bio_save_config_info: Save block i/o config to image header.
5440 + *
5441 + * @buf: PAGE_SIZE'd buffer into which data should be saved.
5442 + */
5443 +static int toi_bio_save_config_info(char *buf)
5444 +{
5445 +       int *ints = (int *) buf;
5446 +       ints[0] = target_outstanding_io;
5447 +       return sizeof(int);
5448 +}
5449 +
5450 +/**
5451 + * toi_bio_load_config_info: Restore block i/o config.
5452 + *
5453 + * @buf: Data to be reloaded.
5454 + * @size: Size of the buffer saved.
5455 + */
5456 +static void toi_bio_load_config_info(char *buf, int size)
5457 +{
5458 +       int *ints = (int *) buf;
5459 +       target_outstanding_io  = ints[0];
5460 +}
5461 +
5462 +/**
5463 + * toi_bio_initialise: Initialise bio code at start of some action.
5464 + *
5465 + * @starting_cycle: Whether starting a hibernation cycle, or just reading or
5466 + * writing a sysfs value.
5467 + */
5468 +static int toi_bio_initialise(int starting_cycle)
5469 +{
5470 +       if (starting_cycle) {
5471 +               max_outstanding_writes = 0;
5472 +               max_outstanding_reads = 0;
5473 +               toi_queue_flusher = current;
5474 +#ifdef MEASURE_MUTEX_CONTENTION
5475 +               {
5476 +               int i, j, k;
5477 +
5478 +               for (i = 0; i < 2; i++)
5479 +                       for (j = 0; j < 2; j++)
5480 +                               for (k = 0; k < NR_CPUS; k++)
5481 +                                       mutex_times[i][j][k] = 0;
5482 +               }
5483 +#endif
5484 +       }
5485 +
5486 +       return 0;
5487 +}
5488 +
5489 +/**
5490 + * toi_bio_cleanup: Cleanup after some action.
5491 + *
5492 + * @finishing_cycle: Whether completing a cycle.
5493 + */
5494 +static void toi_bio_cleanup(int finishing_cycle)
5495 +{
5496 +       if (toi_writer_buffer) {
5497 +               toi_free_page(11, (unsigned long) toi_writer_buffer);
5498 +               toi_writer_buffer = NULL;
5499 +       }
5500 +}
5501 +
5502 +struct toi_bio_ops toi_bio_ops = {
5503 +       .bdev_page_io = toi_bdev_page_io,
5504 +       .finish_all_io = toi_finish_all_io,
5505 +       .forward_one_page = go_next_page,
5506 +       .set_extra_page_forward = set_extra_page_forward,
5507 +       .set_devinfo = toi_set_devinfo,
5508 +       .read_page = toi_bio_read_page,
5509 +       .write_page = toi_bio_write_page,
5510 +       .rw_init = toi_rw_init,
5511 +       .rw_cleanup = toi_rw_cleanup,
5512 +       .read_header_init = toi_read_header_init,
5513 +       .rw_header_chunk = toi_rw_header_chunk,
5514 +       .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead,
5515 +       .write_header_chunk_finish = write_header_chunk_finish,
5516 +       .io_flusher = bio_io_flusher,
5517 +};
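+/*
+ * A writer module (e.g. the swapwriter) is expected to drive these ops
+ * roughly as follows when saving a stream. This is a sketch only - the
+ * function and its parameters are hypothetical and error handling is
+ * abbreviated:
+ */
+#if 0
+static int example_write_stream(struct page *page, unsigned long pfn)
+{
+       int result, result2;
+
+       result = toi_bio_ops.rw_init(WRITE, 1); /* 1 == pageset1 */
+       if (result)
+               return result;
+
+       result = toi_bio_ops.write_page(pfn, page, PAGE_SIZE);
+
+       result2 = toi_bio_ops.rw_cleanup(WRITE);
+       return result ? result : result2;
+}
+#endif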
5518 +
5519 +static struct toi_sysfs_data sysfs_params[] = {
5520 +       { TOI_ATTR("target_outstanding_io", SYSFS_RW),
5521 +         SYSFS_INT(&target_outstanding_io, 0, TARGET_OUTSTANDING_IO, 0),
5522 +       }
5523 +};
5524 +
5525 +static struct toi_module_ops toi_blockwriter_ops = {
5526 +       .name                                   = "lowlevel i/o",
5527 +       .type                                   = MISC_HIDDEN_MODULE,
5528 +       .directory                              = "block_io",
5529 +       .module                                 = THIS_MODULE,
5530 +       .print_debug_info                       = toi_bio_print_debug_stats,
5531 +       .memory_needed                          = toi_bio_memory_needed,
5532 +       .storage_needed                         = toi_bio_storage_needed,
5533 +       .save_config_info                       = toi_bio_save_config_info,
5534 +       .load_config_info                       = toi_bio_load_config_info,
5535 +       .initialise                             = toi_bio_initialise,
5536 +       .cleanup                                = toi_bio_cleanup,
5537 +
5538 +       .sysfs_data             = sysfs_params,
5539 +       .num_sysfs_entries      = ARRAY_SIZE(sysfs_params),
5541 +};
5542 +
5543 +/**
5544 + * toi_block_io_load: Load time routine for block i/o module.
5545 + *
5546 + * Register block i/o ops and sysfs entries.
5547 + */
5548 +static __init int toi_block_io_load(void)
5549 +{
5550 +       return toi_register_module(&toi_blockwriter_ops);
5551 +}
5552 +
5553 +#if defined(CONFIG_TOI_FILE_EXPORTS) || defined(CONFIG_TOI_SWAP_EXPORTS)
5554 +EXPORT_SYMBOL_GPL(toi_writer_posn);
5555 +EXPORT_SYMBOL_GPL(toi_writer_posn_save);
5556 +EXPORT_SYMBOL_GPL(toi_writer_buffer);
5557 +EXPORT_SYMBOL_GPL(toi_writer_buffer_posn);
5558 +EXPORT_SYMBOL_GPL(toi_bio_ops);
5559 +#endif
5560 +#ifdef MODULE
5561 +static __exit void toi_block_io_unload(void)
5562 +{
5563 +       toi_unregister_module(&toi_blockwriter_ops);
5564 +}
5565 +
5566 +module_init(toi_block_io_load);
5567 +module_exit(toi_block_io_unload);
5568 +MODULE_LICENSE("GPL");
5569 +MODULE_AUTHOR("Nigel Cunningham");
5570 +MODULE_DESCRIPTION("TuxOnIce block io functions");
5571 +#else
5572 +late_initcall(toi_block_io_load);
5573 +#endif
5574 diff --git a/kernel/power/tuxonice_block_io.h b/kernel/power/tuxonice_block_io.h
5575 new file mode 100644
5576 index 0000000..833a685
5577 --- /dev/null
5578 +++ b/kernel/power/tuxonice_block_io.h
5579 @@ -0,0 +1,57 @@
5580 +/*
5581 + * kernel/power/tuxonice_block_io.h
5582 + *
5583 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
5584 + * Copyright (C) 2006 Red Hat, inc.
5585 + *
5586 + * Distributed under GPLv2.
5587 + *
5588 + * This file contains declarations for functions exported from
5589 + * tuxonice_block_io.c, which contains low level io functions.
5590 + */
5591 +
5592 +#include <linux/buffer_head.h>
5593 +#include "tuxonice_extent.h"
5594 +
5595 +struct toi_bdev_info {
5596 +       struct block_device *bdev;
5597 +       dev_t dev_t;
5598 +       int bmap_shift;
5599 +       int blocks_per_page;
5600 +};
5601 +
5602 +/*
5603 + * Our exported interface so the swapwriter and filewriter don't
5604 + * need these functions duplicated.
5605 + */
5606 +struct toi_bio_ops {
5607 +       int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
5608 +                       struct page *page);
5609 +       void (*check_io_stats) (void);
5610 +       void (*reset_io_stats) (void);
5611 +       void (*finish_all_io) (void);
5612 +       int (*forward_one_page) (int writing);
5613 +       void (*set_extra_page_forward) (void);
5614 +       void (*set_devinfo) (struct toi_bdev_info *info);
5615 +       int (*read_page) (unsigned long *index, struct page *buffer_page,
5616 +                       unsigned int *buf_size);
5617 +       int (*write_page) (unsigned long index, struct page *buffer_page,
5618 +                       unsigned int buf_size);
5619 +       void (*read_header_init) (void);
5620 +       int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
5621 +                       char *buffer, int buffer_size);
5622 +       int (*rw_header_chunk_noreadahead) (int rw,
5623 +                       struct toi_module_ops *owner,
5624 +                       char *buffer, int buffer_size);
5625 +       int (*write_header_chunk_finish) (void);
5626 +       int (*rw_init) (int rw, int stream_number);
5627 +       int (*rw_cleanup) (int rw);
5628 +       void (*io_flusher) (int rw);
5629 +};
5630 +
5631 +extern struct toi_bio_ops toi_bio_ops;
5632 +
5633 +extern char *toi_writer_buffer;
5634 +extern int toi_writer_buffer_posn;
5635 +extern struct extent_iterate_saved_state toi_writer_posn_save[4];
5636 +extern struct extent_iterate_state toi_writer_posn;
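+
+/*
+ * Illustrative sketch (hypothetical, not part of the interface above):
+ * roughly how a writer module might push one page of image data
+ * through toi_bio_ops. Stream numbers, page indices and error paths
+ * are simplified, and the block is kept under #if 0 so it is never
+ * built.
+ */
+#if 0
+static int example_write_one_page(struct page *page)
+{
+	int ret = toi_bio_ops.rw_init(WRITE, 0);	/* open stream 0 */
+
+	if (ret)
+		return ret;
+	ret = toi_bio_ops.write_page(0, page, PAGE_SIZE);
+	toi_bio_ops.rw_cleanup(WRITE);
+	return ret;
+}
+#endif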
5637 diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
5638 new file mode 100644
5639 index 0000000..11cf575
5640 --- /dev/null
5641 +++ b/kernel/power/tuxonice_builtin.c
5642 @@ -0,0 +1,400 @@
5643 +/*
5644 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
5645 + *
5646 + * This file is released under the GPLv2.
5647 + */
5648 +#include <linux/module.h>
5649 +#include <linux/resume-trace.h>
5650 +#include <linux/syscalls.h>
5651 +#include <linux/kernel.h>
5652 +#include <linux/swap.h>
5653 +#include <linux/syscalls.h>
5654 +#include <linux/bio.h>
5655 +#include <linux/root_dev.h>
5656 +#include <linux/freezer.h>
5657 +#include <linux/reboot.h>
5658 +#include <linux/writeback.h>
5659 +#include <linux/tty.h>
5660 +#include <linux/crypto.h>
5661 +#include <linux/cpu.h>
5662 +#include <linux/dyn_pageflags.h>
5663 +#include <linux/ctype.h>
5664 +#include "tuxonice_io.h"
5665 +#include "tuxonice.h"
5666 +#include "tuxonice_extent.h"
5667 +#include "tuxonice_block_io.h"
5668 +#include "tuxonice_netlink.h"
5669 +#include "tuxonice_prepare_image.h"
5670 +#include "tuxonice_ui.h"
5671 +#include "tuxonice_sysfs.h"
5672 +#include "tuxonice_pagedir.h"
5673 +#include "tuxonice_modules.h"
5674 +#include "tuxonice_builtin.h"
5675 +#include "tuxonice_power_off.h"
5676 +#include "power.h"
5677 +
5678 +/*
5679 + * Highmem related functions (x86 only).
5680 + */
5681 +
5682 +#ifdef CONFIG_HIGHMEM
5683 +
5684 +/**
5685 + * copyback_high: Restore highmem pages.
5686 + *
5687 + * Highmem data and pbe lists are/can be stored in highmem.
5688 + * The format is slightly different to the lowmem pbe lists
5689 + * used for the assembly code: the last pbe in each page is
5690 + * a struct page * instead of struct pbe *, pointing to the
5691 + * next page where pbes are stored (or NULL if it happens to be
5692 + * the end of the list). Since we don't want to generate
5693 + * unnecessary deltas against swsusp code, we use a cast
5694 + * instead of a union.
5695 + **/
5696 +
5697 +static void copyback_high(void)
5698 +{
5699 +       struct page *pbe_page = (struct page *) restore_highmem_pblist;
5700 +       struct pbe *this_pbe, *first_pbe;
5701 +       unsigned long *origpage, *copypage;
5702 +       int pbe_index = 1;
5703 +
5704 +       if (!pbe_page)
5705 +               return;
5706 +
5707 +       this_pbe = (struct pbe *) kmap_atomic(pbe_page, KM_BOUNCE_READ);
5708 +       first_pbe = this_pbe;
5709 +
5710 +       while (this_pbe) {
5711 +               int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
5712 +
5713 +               origpage = kmap_atomic((struct page *) this_pbe->orig_address,
5714 +                       KM_BIO_DST_IRQ);
5715 +               copypage = kmap_atomic((struct page *) this_pbe->address,
5716 +                       KM_BIO_SRC_IRQ);
5717 +
5718 +               while (loop >= 0) {
5719 +                       *(origpage + loop) = *(copypage + loop);
5720 +                       loop--;
5721 +               }
5722 +
5723 +               kunmap_atomic(origpage, KM_BIO_DST_IRQ);
5724 +               kunmap_atomic(copypage, KM_BIO_SRC_IRQ);
5725 +
5726 +               if (!this_pbe->next)
5727 +                       break;
5728 +
5729 +               if (pbe_index < PBES_PER_PAGE) {
5730 +                       this_pbe++;
5731 +                       pbe_index++;
5732 +               } else {
5733 +                       pbe_page = (struct page *) this_pbe->next;
5734 +                       kunmap_atomic(first_pbe, KM_BOUNCE_READ);
5735 +                       if (!pbe_page)
5736 +                               return;
5737 +                       this_pbe = (struct pbe *) kmap_atomic(pbe_page,
5738 +                                       KM_BOUNCE_READ);
5739 +                       first_pbe = this_pbe;
5740 +                       pbe_index = 1;
5741 +               }
5742 +       }
5743 +       kunmap_atomic(first_pbe, KM_BOUNCE_READ);
5744 +}
5745 +
5746 +#else /* CONFIG_HIGHMEM */
5747 +static void copyback_high(void) { }
5748 +#endif
5749 +
5750 +char toi_wait_for_keypress_dev_console(int timeout)
5751 +{
5752 +       int fd, this_timeout = 255;
5753 +       char key = '\0';
5754 +       struct termios t, t_backup;
5755 +
5756 +       /* We should be guaranteed /dev/console exists after populate_rootfs()
5757 +        * in init/main.c.
5758 +        */
5759 +       fd = sys_open("/dev/console", O_RDONLY, 0);
5760 +       if (fd < 0) {
5761 +               printk(KERN_INFO "Couldn't open /dev/console.\n");
5762 +               return key;
5763 +       }
5764 +
5765 +       if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
5766 +               goto out_close;
5767 +
5768 +       memcpy(&t_backup, &t, sizeof(t));
5769 +
5770 +       t.c_lflag &= ~(ISIG|ICANON|ECHO);
5771 +       t.c_cc[VMIN] = 0;
5772 +
5773 +new_timeout:
5774 +       if (timeout > 0) {
5775 +               this_timeout = timeout < 26 ? timeout : 25;
5776 +               timeout -= this_timeout;
5777 +               this_timeout *= 10;
5778 +       }
5779 +
5780 +       t.c_cc[VTIME] = this_timeout;
5781 +
5782 +       if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
5783 +               goto out_restore;
5784 +
5785 +       while (1) {
5786 +               if (sys_read(fd, &key, 1) <= 0) {
5787 +                       if (timeout)
5788 +                               goto new_timeout;
5789 +                       key = '\0';
5790 +                       break;
5791 +               }
5792 +               key = tolower(key);
5793 +               if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
5794 +                       if (key == 'c') {
5795 +                               set_toi_state(TOI_CONTINUE_REQ);
5796 +                               break;
5797 +                       } else if (key == ' ')
5798 +                               break;
5799 +               } else
5800 +                       break;
5801 +       }
5802 +
5803 +out_restore:
5804 +       sys_ioctl(fd, TCSETS, (long)&t_backup);
5805 +out_close:
5806 +       sys_close(fd);
5807 +
5808 +       return key;
5809 +}
5810 +
5811 +struct toi_boot_kernel_data toi_bkd __nosavedata
5812 +               __attribute__((aligned(PAGE_SIZE))) = {
5813 +       MY_BOOT_KERNEL_DATA_VERSION,
5814 +       0,
5815 +#ifdef CONFIG_TOI_REPLACE_SWSUSP
5816 +       (1 << TOI_REPLACE_SWSUSP) |
5817 +#endif
5818 +       (1 << TOI_PAGESET2_FULL) | (1 << TOI_LATE_CPU_HOTPLUG),
5819 +};
5820 +EXPORT_SYMBOL_GPL(toi_bkd);
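+
+/*
+ * On the initialiser above: judging by MY_BOOT_KERNEL_DATA_VERSION,
+ * the positional fields are a version stamp, a zeroed field, and a
+ * default action bitmap. __nosavedata plus page alignment keep the
+ * struct out of the saved image, so the boot kernel's copy survives
+ * the atomic restore.
+ */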
5821 +
5822 +struct block_device *toi_open_by_devnum(dev_t dev, unsigned mode)
5823 +{
5824 +       struct block_device *bdev = bdget(dev);
5825 +       int err = -ENOMEM;
5826 +       int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
5827 +       flags |= O_NONBLOCK;
5828 +       if (bdev)
5829 +               err = blkdev_get(bdev, mode, flags);
5830 +       return err ? ERR_PTR(err) : bdev;
5831 +}
5832 +EXPORT_SYMBOL_GPL(toi_open_by_devnum);
5833 +
5834 +EXPORT_SYMBOL_GPL(toi_wait_for_keypress_dev_console);
5835 +EXPORT_SYMBOL_GPL(hibernation_platform_enter);
5836 +EXPORT_SYMBOL_GPL(platform_begin);
5837 +EXPORT_SYMBOL_GPL(platform_pre_snapshot);
5838 +EXPORT_SYMBOL_GPL(platform_leave);
5839 +EXPORT_SYMBOL_GPL(platform_end);
5840 +EXPORT_SYMBOL_GPL(platform_finish);
5841 +EXPORT_SYMBOL_GPL(platform_pre_restore);
5842 +EXPORT_SYMBOL_GPL(platform_restore_cleanup);
5843 +EXPORT_SYMBOL_GPL(power_kobj);
5844 +EXPORT_SYMBOL_GPL(pm_notifier_call_chain);
5845 +EXPORT_SYMBOL_GPL(init_swsusp_header);
5846 +
5847 +#ifdef CONFIG_ARCH_HIBERNATION_HEADER
5848 +EXPORT_SYMBOL_GPL(arch_hibernation_header_save);
5849 +EXPORT_SYMBOL_GPL(arch_hibernation_header_restore);
5850 +#endif
5851 +
5852 +#ifdef CONFIG_TOI_CORE_EXPORTS
5853 +#ifdef CONFIG_X86_64
5854 +EXPORT_SYMBOL_GPL(restore_processor_state);
5855 +EXPORT_SYMBOL_GPL(save_processor_state);
5856 +#endif
5857 +
5858 +EXPORT_SYMBOL_GPL(drop_pagecache);
5859 +EXPORT_SYMBOL_GPL(restore_pblist);
5860 +EXPORT_SYMBOL_GPL(pm_mutex);
5861 +EXPORT_SYMBOL_GPL(pm_restore_console);
5862 +EXPORT_SYMBOL_GPL(super_blocks);
5863 +EXPORT_SYMBOL_GPL(next_zone);
5864 +
5865 +EXPORT_SYMBOL_GPL(freeze_processes);
5866 +EXPORT_SYMBOL_GPL(thaw_processes);
5867 +EXPORT_SYMBOL_GPL(thaw_kernel_threads);
5868 +EXPORT_SYMBOL_GPL(shrink_all_memory);
5869 +EXPORT_SYMBOL_GPL(shrink_one_zone);
5870 +EXPORT_SYMBOL_GPL(saveable_page);
5871 +EXPORT_SYMBOL_GPL(swsusp_arch_suspend);
5872 +EXPORT_SYMBOL_GPL(swsusp_arch_resume);
5873 +EXPORT_SYMBOL_GPL(pm_prepare_console);
5874 +EXPORT_SYMBOL_GPL(follow_page);
5875 +EXPORT_SYMBOL_GPL(machine_halt);
5876 +EXPORT_SYMBOL_GPL(block_dump);
5877 +EXPORT_SYMBOL_GPL(unlink_lru_lists);
5878 +EXPORT_SYMBOL_GPL(relink_lru_lists);
5879 +EXPORT_SYMBOL_GPL(machine_power_off);
5880 +EXPORT_SYMBOL_GPL(suspend_devices_and_enter);
5881 +EXPORT_SYMBOL_GPL(first_online_pgdat);
5882 +EXPORT_SYMBOL_GPL(next_online_pgdat);
5883 +EXPORT_SYMBOL_GPL(machine_restart);
5884 +EXPORT_SYMBOL_GPL(saved_command_line);
5885 +EXPORT_SYMBOL_GPL(tasklist_lock);
5886 +#ifdef CONFIG_PM_SLEEP_SMP
5887 +EXPORT_SYMBOL_GPL(disable_nonboot_cpus);
5888 +EXPORT_SYMBOL_GPL(enable_nonboot_cpus);
5889 +#endif
5890 +#endif
5891 +
5892 +int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
5893 +
5894 +#ifdef CONFIG_TOI_USERUI_EXPORTS
5895 +EXPORT_SYMBOL_GPL(kmsg_redirect);
5896 +#endif
5897 +EXPORT_SYMBOL_GPL(toi_wait);
5898 +
5899 +#if defined(CONFIG_TOI_USERUI_EXPORTS) || defined(CONFIG_TOI_CORE_EXPORTS)
5900 +EXPORT_SYMBOL_GPL(console_printk);
5901 +#endif
5902 +#ifdef CONFIG_TOI_SWAP_EXPORTS /* TuxOnIce swap specific */
5903 +EXPORT_SYMBOL_GPL(sys_swapon);
5904 +EXPORT_SYMBOL_GPL(sys_swapoff);
5905 +EXPORT_SYMBOL_GPL(si_swapinfo);
5906 +EXPORT_SYMBOL_GPL(map_swap_page);
5907 +EXPORT_SYMBOL_GPL(get_swap_page);
5908 +EXPORT_SYMBOL_GPL(swap_free);
5909 +EXPORT_SYMBOL_GPL(get_swap_info_struct);
5910 +#endif
5911 +
5912 +#ifdef CONFIG_TOI_FILE_EXPORTS
5913 +/* TuxOnIce file allocator specific support */
5914 +EXPORT_SYMBOL_GPL(sys_unlink);
5915 +EXPORT_SYMBOL_GPL(sys_mknod);
5916 +#endif
5917 +
5918 +/* Swap or file */
5919 +#if defined(CONFIG_TOI_FILE_EXPORTS) || defined(CONFIG_TOI_SWAP_EXPORTS)
5920 +EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
5921 +EXPORT_SYMBOL_GPL(name_to_dev_t);
5922 +#endif
5923 +
5924 +#if defined(CONFIG_TOI_FILE_EXPORTS) || defined(CONFIG_TOI_SWAP_EXPORTS) || \
5925 +       defined(CONFIG_TOI_CORE_EXPORTS)
5926 +EXPORT_SYMBOL_GPL(resume_file);
5927 +#endif
5928 +struct toi_core_fns *toi_core_fns;
5929 +EXPORT_SYMBOL_GPL(toi_core_fns);
5930 +
5931 +DECLARE_DYN_PAGEFLAGS(pageset1_map);
5932 +DECLARE_DYN_PAGEFLAGS(pageset1_copy_map);
5933 +EXPORT_SYMBOL_GPL(pageset1_map);
5934 +EXPORT_SYMBOL_GPL(pageset1_copy_map);
5935 +
5936 +unsigned long toi_result;
5937 +struct pagedir pagedir1 = {1};
5938 +
5939 +EXPORT_SYMBOL_GPL(toi_result);
5940 +EXPORT_SYMBOL_GPL(pagedir1);
5941 +
5942 +unsigned long toi_get_nonconflicting_page(void)
5943 +{
5944 +       return toi_core_fns->get_nonconflicting_page();
5945 +}
5946 +
5947 +int toi_post_context_save(void)
5948 +{
5949 +       return toi_core_fns->post_context_save();
5950 +}
5951 +
5952 +int toi_try_hibernate(int have_pmsem)
5953 +{
5954 +       if (!toi_core_fns)
5955 +               return -ENODEV;
5956 +
5957 +       return toi_core_fns->try_hibernate(have_pmsem);
5958 +}
5959 +
5960 +void toi_try_resume(void)
5961 +{
5962 +       if (toi_core_fns)
5963 +               toi_core_fns->try_resume();
5964 +       else
5965 +               printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
5966 +}
5967 +
5968 +int toi_lowlevel_builtin(void)
5969 +{
5970 +       int error = 0;
5971 +
5972 +       save_processor_state();
5973 +       error = swsusp_arch_suspend();
5974 +       if (error)
5975 +               printk(KERN_ERR "Error %d hibernating\n", error);
5976 +
5977 +       /* Restore control flow appears here */
5978 +       if (!toi_in_hibernate) {
5979 +               copyback_high();
5980 +               set_toi_state(TOI_NOW_RESUMING);
5981 +       }
5982 +
5983 +       restore_processor_state();
5984 +
5985 +       return error;
5986 +}
5987 +
5988 +EXPORT_SYMBOL_GPL(toi_lowlevel_builtin);
5989 +
5990 +unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
5991 +EXPORT_SYMBOL_GPL(toi_compress_bytes_in);
5992 +EXPORT_SYMBOL_GPL(toi_compress_bytes_out);
5993 +
5994 +unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
5995 +               (1 << TOI_IGNORE_LOGLEVEL) |
5996 +               (1 << TOI_IO_STOPPED));
5997 +EXPORT_SYMBOL_GPL(toi_state);
5998 +
5999 +/* The number of hibernates we have started (some may have been cancelled) */
6000 +unsigned int nr_hibernates;
6001 +EXPORT_SYMBOL_GPL(nr_hibernates);
6002 +
6003 +int toi_running;
6004 +EXPORT_SYMBOL_GPL(toi_running);
6005 +
6006 +int toi_in_hibernate __nosavedata;
6007 +EXPORT_SYMBOL_GPL(toi_in_hibernate);
6008 +
6009 +__nosavedata struct pbe *restore_highmem_pblist;
6010 +
6011 +#ifdef CONFIG_TOI_CORE_EXPORTS
6012 +#ifdef CONFIG_HIGHMEM
6013 +EXPORT_SYMBOL_GPL(nr_free_highpages);
6014 +EXPORT_SYMBOL_GPL(saveable_highmem_page);
6015 +EXPORT_SYMBOL_GPL(restore_highmem_pblist);
6016 +#endif
6017 +#endif
6018 +
6019 +#if defined(CONFIG_TOI_CORE_EXPORTS) || defined(CONFIG_TOI_PAGEFLAGS_EXPORTS)
6020 +EXPORT_SYMBOL_GPL(max_pfn);
6021 +#endif
6022 +
6023 +#if defined(CONFIG_TOI_EXPORTS) || defined(CONFIG_TOI_CORE_EXPORTS)
6024 +EXPORT_SYMBOL_GPL(snprintf_used);
6025 +#endif
6026 +
6027 +static int __init toi_wait_setup(char *str)
6028 +{
6029 +       int value;
6030 +
6031 +       if (sscanf(str, "=%d", &value)) {
6032 +               if (value < -1 || value > 255)
6033 +                       printk(KERN_INFO "toi_wait outside range -1 to "
6034 +                                       "255.\n");
6035 +               else
6036 +                       toi_wait = value;
6037 +       }
6038 +
6039 +       return 1;
6040 +}
6041 +
6042 +__setup("toi_wait", toi_wait_setup);
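+
+/*
+ * Example: booting with toi_wait=5 leaves the handler's str as "=5",
+ * so the sscanf() above matches and toi_wait becomes 5. Values
+ * outside -1..255 are rejected with the message above; what the wait
+ * is used for is up to the callers of toi_wait elsewhere in this
+ * patch.
+ */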
6043 diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
6044 new file mode 100644
6045 index 0000000..f4966c4
6046 --- /dev/null
6047 +++ b/kernel/power/tuxonice_builtin.h
6048 @@ -0,0 +1,31 @@
6049 +/*
6050 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
6051 + *
6052 + * This file is released under the GPLv2.
6053 + */
6054 +#include <linux/dyn_pageflags.h>
6055 +#include <asm/setup.h>
6056 +
6057 +extern struct toi_core_fns *toi_core_fns;
6058 +extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
6059 +extern unsigned int nr_hibernates;
6060 +extern int toi_in_hibernate;
6061 +
6062 +extern __nosavedata struct pbe *restore_highmem_pblist;
6063 +
6064 +int toi_lowlevel_builtin(void);
6065 +
6066 +extern struct dyn_pageflags __nosavedata toi_nosave_origmap;
6067 +extern struct dyn_pageflags __nosavedata toi_nosave_copymap;
6068 +
6069 +#ifdef CONFIG_HIGHMEM
6070 +extern __nosavedata struct zone_data *toi_nosave_zone_list;
6071 +extern __nosavedata unsigned long toi_nosave_max_pfn;
6072 +#endif
6073 +
6074 +extern unsigned long toi_get_nonconflicting_page(void);
6075 +extern int toi_post_context_save(void);
6076 +extern int toi_try_hibernate(int have_pmsem);
6077 +extern char toi_wait_for_keypress_dev_console(int timeout);
6078 +extern struct block_device *toi_open_by_devnum(dev_t dev, unsigned mode);
6079 +extern int toi_wait;
6080 diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
6081 new file mode 100644
6082 index 0000000..eea3029
6083 --- /dev/null
6084 +++ b/kernel/power/tuxonice_checksum.c
6085 @@ -0,0 +1,389 @@
6086 +/*
6087 + * kernel/power/tuxonice_checksum.c
6088 + *
6089 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
6090 + * Copyright (C) 2006 Red Hat, inc.
6091 + *
6092 + * This file is released under the GPLv2.
6093 + *
6094 + * This file contains data checksum routines for TuxOnIce,
6095 + * using cryptoapi. They are used to locate any modifications
6096 + * made to pageset 2 while we're saving it.
6097 + */
6098 +
6099 +#include <linux/suspend.h>
6100 +#include <linux/module.h>
6101 +#include <linux/highmem.h>
6102 +#include <linux/vmalloc.h>
6103 +#include <linux/crypto.h>
6104 +#include <linux/scatterlist.h>
6105 +
6106 +#include "tuxonice.h"
6107 +#include "tuxonice_modules.h"
6108 +#include "tuxonice_sysfs.h"
6109 +#include "tuxonice_io.h"
6110 +#include "tuxonice_pageflags.h"
6111 +#include "tuxonice_checksum.h"
6112 +#include "tuxonice_pagedir.h"
6113 +#include "tuxonice_alloc.h"
6114 +
6115 +static struct toi_module_ops toi_checksum_ops;
6116 +
6117 +/* Constant for now, but I might allow tuning later */
6118 +static char toi_checksum_name[32] = "md4";
6119 +/* Bytes per checksum */
6120 +#define CHECKSUM_SIZE (16)
6121 +
6122 +#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
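+
+/*
+ * Layout note: the first sizeof(void *) bytes of each checksum page
+ * hold the link to the next page (see allocate_checksum_pages below),
+ * so only the remainder stores digests. With 4K pages, 8-byte
+ * pointers and 16-byte digests, that is (4096 - 8) / 16 = 255
+ * checksums per page.
+ */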
6123 +
6124 +struct cpu_context {
6125 +       struct crypto_hash *transform;
6126 +       struct hash_desc desc;
6127 +       struct scatterlist sg[2];
6128 +       char *buf;
6129 +};
6130 +
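+/*
+ * One context per CPU: tuxonice_calc_checksum() and check_checksums()
+ * pick their slot with smp_processor_id(), giving each processor its
+ * own transform and bounce buffer -- presumably why no locking is
+ * needed around the digest calls.
+ */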
6131 +static DEFINE_PER_CPU(struct cpu_context, contexts);
6132 +static int pages_allocated;
6133 +static unsigned long page_list;
6134 +
6135 +static int toi_num_resaved;
6136 +
6137 +static unsigned long this_checksum, next_page;
6138 +static int checksum_index;
6139 +
6140 +static inline int checksum_pages_needed(void)
6141 +{
6142 +       return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
6143 +}
6144 +
6145 +/* ---- Local buffer management ---- */
6146 +
6147 +/*
6148 + * toi_checksum_cleanup
6149 + *
6150 + * Frees memory allocated for our labours.
6151 + */
6152 +static void toi_checksum_cleanup(int ending_cycle)
6153 +{
6154 +       int cpu;
6155 +
6156 +       if (ending_cycle) {
6157 +               for_each_online_cpu(cpu) {
6158 +                       struct cpu_context *this = &per_cpu(contexts, cpu);
6159 +                       if (this->transform) {
6160 +                               crypto_free_hash(this->transform);
6161 +                               this->transform = NULL;
6162 +                               this->desc.tfm = NULL;
6163 +                       }
6164 +
6165 +                       if (this->buf) {
6166 +                               toi_free_page(27, (unsigned long) this->buf);
6167 +                               this->buf = NULL;
6168 +                       }
6169 +               }
6170 +       }
6171 +}
6172 +
6173 +/*
6174 + * toi_checksum_initialise
6175 + *
6176 + * Prepare to do some work by allocating buffers and transforms.
6177 + * Returns: Int: Zero on success, or when checksumming is disabled;
6178 + * one if the algorithm name, transform or buffer couldn't be set up.
6179 + */
6180 +static int toi_checksum_initialise(int starting_cycle)
6181 +{
6182 +       int cpu;
6183 +
6184 +       if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
6185 +               return 0;
6186 +
6187 +       if (!*toi_checksum_name) {
6188 +               printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
6189 +               return 1;
6190 +       }
6191 +
6192 +       for_each_online_cpu(cpu) {
6193 +               struct cpu_context *this = &per_cpu(contexts, cpu);
6194 +               struct page *page;
6195 +
6196 +               this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
6197 +               if (IS_ERR(this->transform)) {
6198 +                       printk(KERN_INFO "TuxOnIce: Failed to initialise the "
6199 +                               "%s checksum algorithm: %ld.\n",
6200 +                               toi_checksum_name, (long) this->transform);
6201 +                       this->transform = NULL;
6202 +                       return 1;
6203 +               }
6204 +
6205 +               this->desc.tfm = this->transform;
6206 +               this->desc.flags = 0;
6207 +
6208 +               page = toi_alloc_page(27, GFP_KERNEL);
6209 +               if (!page)
6210 +                       return 1;
6211 +               this->buf = page_address(page);
6212 +               sg_set_buf(&this->sg[0], this->buf, PAGE_SIZE);
6213 +       }
6214 +       return 0;
6215 +}
6216 +
6217 +/*
6218 + * toi_checksum_print_debug_stats
6219 + * @buffer: Pointer to a buffer into which the debug info will be printed.
6220 + * @size: Size of the buffer.
6221 + *
6222 + * Print information to be recorded for debugging purposes into a buffer.
6223 + * Returns: Number of characters written to the buffer.
6224 + */
6225 +
6226 +static int toi_checksum_print_debug_stats(char *buffer, int size)
6227 +{
6228 +       int len;
6229 +
6230 +       if (!toi_checksum_ops.enabled)
6231 +               return snprintf_used(buffer, size,
6232 +                       "- Checksumming disabled.\n");
6233 +
6234 +       len = snprintf_used(buffer, size, "- Checksum method is '%s'.\n",
6235 +                       toi_checksum_name);
6236 +       len += snprintf_used(buffer + len, size - len,
6237 +               "  %d pages resaved in atomic copy.\n", toi_num_resaved);
6238 +       return len;
6239 +}
6240 +
6241 +static int toi_checksum_memory_needed(void)
6242 +{
6243 +       return toi_checksum_ops.enabled ?
6244 +               checksum_pages_needed() << PAGE_SHIFT : 0;
6245 +}
6246 +
6247 +static int toi_checksum_storage_needed(void)
6248 +{
6249 +       if (toi_checksum_ops.enabled)
6250 +               return strlen(toi_checksum_name) + sizeof(int) + 1;
6251 +       else
6252 +               return 0;
6253 +}
6254 +
6255 +/*
6256 + * toi_checksum_save_config_info
6257 + * @buffer: Pointer to a buffer of size PAGE_SIZE.
6258 + *
6259 + * Save information needed when reloading the image at resume time.
6260 + * Returns: Number of bytes used for saving our data.
6261 + */
6262 +static int toi_checksum_save_config_info(char *buffer)
6263 +{
6264 +       int namelen = strlen(toi_checksum_name) + 1;
6265 +       int total_len;
6266 +
6267 +       *((unsigned int *) buffer) = namelen;
6268 +       strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen);
6269 +       total_len = sizeof(unsigned int) + namelen;
6270 +       return total_len;
6271 +}
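+
+/*
+ * Worked example of the layout above: for the default "md4", namelen
+ * is 4 (three characters plus the NUL), so the buffer holds a
+ * native-endian unsigned int of 4 followed by "md4\0" -- 8 bytes in
+ * all with 32-bit ints.
+ */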
6272 +
6273 +/* toi_checksum_load_config_info
6274 + * @buffer: Pointer to the start of the data.
6275 + * @size: Number of bytes that were saved.
6276 + *
6277 + * Description:        Reload information needed for dechecksumming the image at
6278 + * resume time.
6279 + */
6280 +static void toi_checksum_load_config_info(char *buffer, int size)
6281 +{
6282 +       int namelen;
6283 +
6284 +       namelen = min_t(int, *((unsigned int *) buffer),
6285 +                       sizeof(toi_checksum_name));
6286 +       strncpy(toi_checksum_name, buffer + sizeof(unsigned int),
6287 +                       namelen);
6288 +}
6289 +
6290 +/*
6291 + * Free Checksum Memory
6292 + */
6293 +
6294 +void free_checksum_pages(void)
6295 +{
6296 +       while (pages_allocated) {
6297 +               unsigned long next = *((unsigned long *) page_list);
6298 +               ClearPageNosave(virt_to_page(page_list));
6299 +               toi_free_page(15, (unsigned long) page_list);
6300 +               page_list = next;
6301 +               pages_allocated--;
6302 +       }
6303 +}
6304 +
6305 +/*
6306 + * Allocate Checksum Memory
6307 + */
6308 +
6309 +int allocate_checksum_pages(void)
6310 +{
6311 +       int pages_needed = checksum_pages_needed();
6312 +
6313 +       if (!toi_checksum_ops.enabled)
6314 +               return 0;
6315 +
6316 +       while (pages_allocated < pages_needed) {
6317 +               unsigned long *new_page =
6318 +                 (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
6319 +               if (!new_page) {
6320 +                       printk(KERN_ERR "Unable to allocate checksum pages.\n");
6321 +                       return -ENOMEM;
6322 +               }
6323 +               SetPageNosave(virt_to_page(new_page));
6324 +               (*new_page) = page_list;
6325 +               page_list = (unsigned long) new_page;
6326 +               pages_allocated++;
6327 +       }
6328 +
6329 +       next_page = (unsigned long) page_list;
6330 +       checksum_index = 0;
6331 +
6332 +       return 0;
6333 +}
6334 +
6335 +#if 0
6336 +static void print_checksum(char *buf, int size)
6337 +{
6338 +       int index;
6339 +
6340 +       for (index = 0; index < size; index++)
6341 +               printk(KERN_INFO "%x ", buf[index]);
6342 +
6343 +       printk("\n");
6344 +}
6345 +#endif
6346 +
6347 +char *tuxonice_get_next_checksum(void)
6348 +{
6349 +       if (!toi_checksum_ops.enabled)
6350 +               return NULL;
6351 +
6352 +       if (checksum_index % CHECKSUMS_PER_PAGE)
6353 +               this_checksum += CHECKSUM_SIZE;
6354 +       else {
6355 +               this_checksum = next_page + sizeof(void *);
6356 +               next_page = *((unsigned long *) next_page);
6357 +       }
6358 +
6359 +       checksum_index++;
6360 +       return (char *) this_checksum;
6361 +}
6362 +
6363 +int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
6364 +{
6365 +       char *pa;
6366 +       int result, cpu = smp_processor_id();
6367 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
6368 +
6369 +       if (!toi_checksum_ops.enabled)
6370 +               return 0;
6371 +
6372 +       pa = kmap(page);
6373 +       memcpy(ctx->buf, pa, PAGE_SIZE);
6374 +       kunmap(page);
6375 +       result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
6376 +                                               checksum_locn);
6377 +       return result;
6378 +}
6379 +/*
6380 + * Recalculate checksums and compare with those stored, flagging
6380 + * changed pages for resaving
6381 + */
6382 +
6383 +void check_checksums(void)
6384 +{
6385 +       int pfn, index = 0, cpu = smp_processor_id();
6386 +       unsigned long next_page, this_checksum = 0;
6387 +       char current_checksum[CHECKSUM_SIZE];
6388 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
6389 +
6390 +       if (!toi_checksum_ops.enabled)
6391 +               return;
6392 +
6393 +       next_page = (unsigned long) page_list;
6394 +
6395 +       toi_num_resaved = 0;
6396 +
6397 +       BITMAP_FOR_EACH_SET(&pageset2_map, pfn) {
6398 +               int ret;
6399 +               char *pa;
6400 +               struct page *page = pfn_to_page(pfn);
6401 +
6402 +               if (index % CHECKSUMS_PER_PAGE) {
6403 +                       this_checksum += CHECKSUM_SIZE;
6404 +               } else {
6405 +                       this_checksum = next_page + sizeof(void *);
6406 +                       next_page = *((unsigned long *) next_page);
6407 +               }
6408 +
6409 +               /* Done when IRQs disabled so must be atomic */
6410 +               pa = kmap_atomic(page, KM_USER1);
6411 +               memcpy(ctx->buf, pa, PAGE_SIZE);
6412 +               kunmap_atomic(pa, KM_USER1);
6413 +               ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
6414 +                                                       current_checksum);
6415 +
6416 +               if (ret) {
6417 +                       printk(KERN_INFO "Digest failed. Returned %d.\n", ret);
6418 +                       return;
6419 +               }
6420 +
6421 +               if (memcmp(current_checksum, (char *) this_checksum,
6422 +                                                       CHECKSUM_SIZE)) {
6423 +                       SetPageResave(pfn_to_page(pfn));
6424 +                       toi_num_resaved++;
6425 +                       if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
6426 +                               set_abort_result(TOI_RESAVE_NEEDED);
6427 +               }
6428 +
6429 +               index++;
6430 +       }
6431 +}
6432 +
6433 +static struct toi_sysfs_data sysfs_params[] = {
6434 +       { TOI_ATTR("enabled", SYSFS_RW),
6435 +         SYSFS_INT(&toi_checksum_ops.enabled, 0, 1, 0)
6436 +       },
6437 +
6438 +       { TOI_ATTR("abort_if_resave_needed", SYSFS_RW),
6439 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_ABORT_ON_RESAVE_NEEDED, 0)
6440 +       }
6441 +};
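+
+/*
+ * These entries presumably surface under the module's "checksum"
+ * sysfs directory (see .directory below): "enabled" as a 0/1 integer
+ * and "abort_if_resave_needed" as a bit in toi_bkd.toi_action.
+ */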
6442 +
6443 +/*
6444 + * Ops structure.
6445 + */
6446 +static struct toi_module_ops toi_checksum_ops = {
6447 +       .type                   = MISC_MODULE,
6448 +       .name                   = "checksumming",
6449 +       .directory              = "checksum",
6450 +       .module                 = THIS_MODULE,
6451 +       .initialise             = toi_checksum_initialise,
6452 +       .cleanup                = toi_checksum_cleanup,
6453 +       .print_debug_info       = toi_checksum_print_debug_stats,
6454 +       .save_config_info       = toi_checksum_save_config_info,
6455 +       .load_config_info       = toi_checksum_load_config_info,
6456 +       .memory_needed          = toi_checksum_memory_needed,
6457 +       .storage_needed         = toi_checksum_storage_needed,
6458 +
6459 +       .sysfs_data             = sysfs_params,
6460 +       .num_sysfs_entries      = sizeof(sysfs_params) /
6461 +               sizeof(struct toi_sysfs_data),
6462 +};
6463 +
6464 +/* ---- Registration ---- */
6465 +int toi_checksum_init(void)
6466 +{
6467 +       int result = toi_register_module(&toi_checksum_ops);
6468 +       return result;
6469 +}
6470 +
6471 +void toi_checksum_exit(void)
6472 +{
6473 +       toi_unregister_module(&toi_checksum_ops);
6474 +}
6475 diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
6476 new file mode 100644
6477 index 0000000..81b928d
6478 --- /dev/null
6479 +++ b/kernel/power/tuxonice_checksum.h
6480 @@ -0,0 +1,32 @@
6481 +/*
6482 + * kernel/power/tuxonice_checksum.h
6483 + *
6484 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
6485 + * Copyright (C) 2006 Red Hat, inc.
6486 + *
6487 + * This file is released under the GPLv2.
6488 + *
6489 + * This file contains data checksum routines for TuxOnIce,
6490 + * using cryptoapi. They are used to locate any modifications
6491 + * made to pageset 2 while we're saving it.
6492 + */
6493 +
6494 +#if defined(CONFIG_TOI_CHECKSUM)
6495 +extern int toi_checksum_init(void);
6496 +extern void toi_checksum_exit(void);
6497 +void check_checksums(void);
6498 +int allocate_checksum_pages(void);
6499 +void free_checksum_pages(void);
6500 +char *tuxonice_get_next_checksum(void);
6501 +int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
6502 +#else
6503 +static inline int toi_checksum_init(void) { return 0; }
6504 +static inline void toi_checksum_exit(void) { }
6505 +static inline void check_checksums(void) { }
6506 +static inline int allocate_checksum_pages(void) { return 0; }
6507 +static inline void free_checksum_pages(void) { }
6508 +static inline char *tuxonice_get_next_checksum(void) { return NULL; }
6509 +static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
6510 +       { return 0; }
6511 +#endif
6512 +
6513 diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
6514 new file mode 100644
6515 index 0000000..b5c9ea1
6516 --- /dev/null
6517 +++ b/kernel/power/tuxonice_cluster.c
6518 @@ -0,0 +1,1088 @@
6519 +/*
6520 + * kernel/power/tuxonice_cluster.c
6521 + *
6522 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
6523 + *
6524 + * This file is released under the GPLv2.
6525 + *
6526 + * This file contains routines for cluster hibernation support.
6527 + *
6528 + * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
6529 + *
6530 + * How does it work?
6531 + *
6532 + * There is no 'master' node that tells everyone else what to do. All nodes
6533 + * send messages to the broadcast address/port, maintain a list of peers
6534 + * and figure out when to progress to the next step in hibernating or resuming.
6535 + * This makes us more fault tolerant when it comes to nodes coming and going
6536 + * (which may be more of an issue if we're hibernating when power supplies
6537 + * are unreliable).
6538 + *
6539 + * At boot time, we start a ktuxonice thread that handles communication with
6540 + * other nodes. This node maintains a state machine that controls our progress
6541 + * through hibernating and resuming, keeping us in step with other nodes. Nodes
6542 + * are identified by their hw address.
6543 + *
6544 + * On startup, the node sends CLUSTER_PING on the configured interface's
6545 + * broadcast address, port $toi_cluster_port (see below) and begins to listen
6546 + * for other broadcast messages. CLUSTER_PING messages are repeated at
6547 + * intervals of 5 minutes, with a random offset to spread traffic out.
6548 + *
6549 + * A hibernation cycle is initiated from any node via
6550 + *
6551 + * echo > /sys/power/tuxonice/do_hibernate
6552 + *
6553 + * and (possibly) the hibernate script. At each step of the process, the node
6554 + * completes its work, and waits for all other nodes to signal completion of
6555 + * their work (or timeout) before progressing to the next step.
6556 + *
6557 + * Request/state  Action before reply  Possible reply  Next state
6558 + * HIBERNATE     capable, pre-script   HIBERNATE|ACK   NODE_PREP
6559 + *                                     HIBERNATE|NACK  INIT_0
6560 + *
6561 + * PREP                  prepare_image         PREP|ACK        IMAGE_WRITE
6562 + *                                     PREP|NACK       INIT_0
6563 + *                                     ABORT           RUNNING
6564 + *
6565 + * IO            write image           IO|ACK          power off
6566 + *                                     ABORT           POST_RESUME
6567 + *
6568 + * (Boot time)   check for image       IMAGE|ACK       RESUME_PREP
6569 + *                                     (Note 1)
6570 + *                                     IMAGE|NACK      (Note 2)
6571 + *
6572 + * PREP                  prepare read image    PREP|ACK        IMAGE_READ
6573 + *                                     PREP|NACK       (As NACK_IMAGE)
6574 + *
6575 + * IO            read image            IO|ACK          POST_RESUME
6576 + *
6577 + * POST_RESUME   thaw, post-script                     RUNNING
6578 + *
6579 + * INIT_0        init 0
6580 + *
6581 + * Other messages:
6582 + *
6583 + * - PING: Request for all other live nodes to send a PONG. Used at startup to
6584 + *   announce presence, when a node is suspected dead and periodically, in case
6585 + *   segments of the network are [un]plugged.
6586 + *
6587 + * - PONG: Response to a PING.
6588 + *
6589 + * - ABORT: Request to cancel writing an image.
6590 + *
6591 + * - BYE: Notification that this node is shutting down.
6592 + *
6593 + * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
6594 + * nodes which are slower to start up can get state synchronised. If a node
6595 + * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
6596 + * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
6597 + * must invalidate its image (if any) and boot normally.
6598 + *
6599 + * Note 2: May occur when one node lost power or powered off while others
6600 + * hibernated. This node waits for others to complete resuming (ACK_READ)
6601 + * before completing its boot, so that it appears as a failed node restarting.
6602 + *
6603 + * If any node has an image, then it also has a list of nodes that hibernated
6604 + * in synchronisation with it. The node will wait for other nodes to appear
6605 + * or timeout before beginning its restoration.
6606 + *
6607 + * If a node has no image, it needs to wait, in case other nodes which do have
6608 + * an image are going to resume, but are taking longer to announce their
6609 + * presence. For this reason, the user can specify a timeout value and a number
6610 + * of nodes detected before we just continue. (We might want to assume that in
6611 + * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
6612 + * the remaining nodes will too. This might help in situations where some nodes
6613 + * are much slower to boot, or more subject to hardware failures or such like).
6614 + */
6615 +
6616 +#include <linux/suspend.h>
6617 +#include <linux/module.h>
6618 +#include <linux/moduleparam.h>
6619 +#include <linux/if.h>
6620 +#include <linux/rtnetlink.h>
6621 +#include <linux/ip.h>
6622 +#include <linux/udp.h>
6623 +#include <linux/in.h>
6624 +#include <linux/if_arp.h>
6625 +#include <linux/kthread.h>
6626 +#include <linux/wait.h>
6627 +#include <linux/netdevice.h>
6628 +#include <net/ip.h>
6629 +
6630 +#include "tuxonice.h"
6631 +#include "tuxonice_modules.h"
6632 +#include "tuxonice_sysfs.h"
6633 +#include "tuxonice_alloc.h"
6634 +#include "tuxonice_io.h"
6635 +
6636 +#if 1
6637 +#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
6638 +#else
6639 +#define PRINTK(a, b...) do { } while (0)
6640 +#endif
6641 +
6642 +static int loopback_mode;
6643 +static int num_local_nodes = 1;
6644 +#define MAX_LOCAL_NODES 8
6645 +#define SADDR (loopback_mode ? b->sid : h->saddr)
6646 +
6647 +#define MYNAME "TuxOnIce Clustering"
6648 +
6649 +enum cluster_message {
6650 +       MSG_ACK = 1,
6651 +       MSG_NACK = 2,
6652 +       MSG_PING = 4,
6653 +       MSG_ABORT = 8,
6654 +       MSG_BYE = 16,
6655 +       MSG_HIBERNATE = 32,
6656 +       MSG_IMAGE = 64,
6657 +       MSG_IO = 128,
6658 +       MSG_RUNNING = 256
6659 +};
6660 +
6661 +static char *str_message(int message)
6662 +{
6663 +       switch (message) {
6664 +       case 4:
6665 +               return "Ping";
6666 +       case 8:
6667 +               return "Abort";
6668 +       case 9:
6669 +               return "Abort acked";
6670 +       case 10:
6671 +               return "Abort nacked";
6672 +       case 16:
6673 +               return "Bye";
6674 +       case 17:
6675 +               return "Bye acked";
6676 +       case 18:
6677 +               return "Bye nacked";
6678 +       case 32:
6679 +               return "Hibernate request";
6680 +       case 33:
6681 +               return "Hibernate ack";
6682 +       case 34:
6683 +               return "Hibernate nack";
6684 +       case 64:
6685 +               return "Image exists?";
6686 +       case 65:
6687 +               return "Image does exist";
6688 +       case 66:
6689 +               return "No image here";
6690 +       case 128:
6691 +               return "I/O";
6692 +       case 129:
6693 +               return "I/O okay";
6694 +       case 130:
6695 +               return "I/O failed";
6696 +       case 256:
6697 +               return "Running";
6698 +       default:
6699 +               printk(KERN_ERR "Unrecognised message %d.\n", message);
6700 +               return "Unrecognised message (see dmesg)";
6701 +       }
6702 +}
6703 +
6704 +#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
6705 +#define MSG_STATE_MASK (~MSG_ACK_MASK)
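+
+/*
+ * Example decoding: MSG_HIBERNATE | MSG_ACK is 32 | 1 = 33, which
+ * str_message() above reports as "Hibernate ack". Masking with
+ * MSG_STATE_MASK strips the ack/nack bits back to MSG_HIBERNATE;
+ * MSG_ACK_MASK recovers just the reply bits.
+ */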
6706 +
6707 +struct node_info {
6708 +       struct list_head member_list;
6709 +       wait_queue_head_t member_events;
6710 +       spinlock_t member_list_lock;
6711 +       spinlock_t receive_lock;
6712 +       int peer_count, ignored_peer_count;
6713 +       struct toi_sysfs_data sysfs_data;
6714 +       enum cluster_message current_message;
6715 +};
6716 +
6717 +struct node_info node_array[MAX_LOCAL_NODES];
6718 +
6719 +struct cluster_member {
6720 +       __be32 addr;
6721 +       enum cluster_message message;
6722 +       struct list_head list;
6723 +       int ignore;
6724 +};
6725 +
6726 +#define toi_cluster_port_send 3501
6727 +#define toi_cluster_port_recv 3502
6728 +
6729 +static struct net_device *net_dev;
6730 +static struct toi_module_ops toi_cluster_ops;
6731 +
6732 +static int toi_recv(struct sk_buff *skb, struct net_device *dev,
6733 +               struct packet_type *pt, struct net_device *orig_dev);
6734 +
6735 +static struct packet_type toi_cluster_packet_type = {
6736 +       .type = __constant_htons(ETH_P_IP),
6737 +       .func = toi_recv,
6738 +};
6739 +
6740 +struct toi_pkt {               /* BOOTP packet format */
6741 +       struct iphdr iph;       /* IP header */
6742 +       struct udphdr udph;     /* UDP header */
6743 +       u8 htype;               /* HW address type */
6744 +       u8 hlen;                /* HW address length */
6745 +       __be32 xid;             /* Transaction ID */
6746 +       __be16 secs;            /* Seconds since we started */
6747 +       __be16 flags;           /* Just what it says */
6748 +       u8 hw_addr[16];         /* Sender's HW address */
6749 +       u16 message;            /* Message */
6750 +       unsigned long sid;      /* Source ID for loopback testing */
6751 +};
6752 +
6753 +static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
6754 +
6755 +static int added_pack;
6756 +
6757 +static int others_have_image;
6758 +
6759 +/* Key used to allow multiple clusters on the same lan */
6760 +static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
6761 +static char pre_hibernate_script[255] =
6762 +       CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
6763 +static char post_hibernate_script[255] =
6764 +       CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
6765 +
6766 +/*                     List of cluster members                 */
6767 +static unsigned long continue_delay = 5 * HZ;
6768 +static unsigned long cluster_message_timeout = 3 * HZ;
6769 +
6770 +/*             === Membership list ===         */
6771 +
6772 +static void print_member_info(int index)
6773 +{
6774 +       struct cluster_member *this;
6775 +
6776 +       printk(KERN_INFO "==> Dumping node %d.\n", index);
6777 +
6778 +       list_for_each_entry(this, &node_array[index].member_list, list)
6779 +               printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
6780 +                               NIPQUAD(this->addr),
6781 +                               str_message(this->message),
6782 +                               this->ignore ? "(Ignored)" : "");
6783 +       printk(KERN_INFO "== Done ==\n");
6784 +}
6785 +
6786 +static struct cluster_member *__find_member(int index, __be32 addr)
6787 +{
6788 +       struct cluster_member *this;
6789 +
6790 +       list_for_each_entry(this, &node_array[index].member_list, list) {
6791 +               if (this->addr != addr)
6792 +                       continue;
6793 +
6794 +               return this;
6795 +       }
6796 +
6797 +       return NULL;
6798 +}
6799 +
6800 +static void set_ignore(int index, __be32 addr, struct cluster_member *this)
6801 +{
6802 +       if (this->ignore) {
6803 +               PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
6804 +                               index, NIPQUAD(addr));
6805 +               return;
6806 +       }
6807 +
6808 +       PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
6809 +                               index, NIPQUAD(addr));
6810 +       this->ignore = 1;
6811 +       node_array[index].ignored_peer_count++;
6812 +}
6813 +
6814 +static int __add_update_member(int index, __be32 addr, int message)
6815 +{
6816 +       struct cluster_member *this;
6817 +
6818 +       this = __find_member(index, addr);
6819 +       if (this) {
6820 +               if (this->message != message) {
6821 +                       this->message = message;
6822 +                       if ((message & MSG_NACK) &&
6823 +                           (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
6824 +                               set_ignore(index, addr, this);
6825 +                       PRINTK("Node %d sees node %d.%d.%d.%d now sending "
6826 +                                       "%s.\n", index, NIPQUAD(addr),
6827 +                                       str_message(message));
6828 +                       wake_up(&node_array[index].member_events);
6829 +               }
6830 +               return 0;
6831 +       }
6832 +
6833 +       this = (struct cluster_member *) toi_kzalloc(36,
6834 +                       sizeof(struct cluster_member), GFP_KERNEL);
6835 +
6836 +       if (!this)
6837 +               return -1;
6838 +
6839 +       this->addr = addr;
6840 +       this->message = message;
6841 +       this->ignore = 0;
6842 +       INIT_LIST_HEAD(&this->list);
6843 +
6844 +       node_array[index].peer_count++;
6845 +
6846 +       PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
6847 +                       NIPQUAD(addr), str_message(message));
6848 +
6849 +       if ((message & MSG_NACK) &&
6850 +           (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
6851 +               set_ignore(index, addr, this);
6852 +       list_add_tail(&this->list, &node_array[index].member_list);
6853 +       return 1;
6854 +}
6855 +
6856 +static int add_update_member(int index, __be32 addr, int message)
6857 +{
6858 +       int result;
6859 +       unsigned long flags;
6860 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
6861 +       result = __add_update_member(index, addr, message);
6862 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
6863 +
6864 +       print_member_info(index);
6865 +
6866 +       wake_up(&node_array[index].member_events);
6867 +
6868 +       return result;
6869 +}
6870 +
6871 +static void del_member(int index, __be32 addr)
6872 +{
6873 +       struct cluster_member *this;
6874 +       unsigned long flags;
6875 +
6876 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
6877 +       this = __find_member(index, addr);
6878 +
6879 +       if (this) {
6880 +               list_del_init(&this->list);
6881 +               toi_kfree(36, this);
6882 +               node_array[index].peer_count--;
6883 +       }
6884 +
6885 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
6886 +}
6887 +
6888 +/*             === Message transmission ===    */
6889 +
6890 +static void toi_send_if(int message, unsigned long my_id);
6891 +
6892 +/*
6893 + *  Process received TOI packet.
6894 + */
6895 +static int toi_recv(struct sk_buff *skb, struct net_device *dev,
6896 +               struct packet_type *pt, struct net_device *orig_dev)
6897 +{
6898 +       struct toi_pkt *b;
6899 +       struct iphdr *h;
6900 +       int len, result, index;
6901 +       unsigned long addr, message, ack;
6902 +
6903 +       /* Perform verifications before taking the lock.  */
6904 +       if (skb->pkt_type == PACKET_OTHERHOST)
6905 +               goto drop;
6906 +
6907 +       if (dev != net_dev)
6908 +               goto drop;
6909 +
6910 +       skb = skb_share_check(skb, GFP_ATOMIC);
6911 +       if (!skb)
6912 +               return NET_RX_DROP;
6913 +
6914 +       if (!pskb_may_pull(skb,
6915 +                          sizeof(struct iphdr) +
6916 +                          sizeof(struct udphdr)))
6917 +               goto drop;
6918 +
6919 +       b = (struct toi_pkt *)skb_network_header(skb);
6920 +       h = &b->iph;
6921 +
6922 +       if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
6923 +               goto drop;
6924 +
6925 +       /* Fragments are not supported */
6926 +       if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
6927 +               if (net_ratelimit())
6928 +                       printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
6929 +                              "cluster message.\n");
6930 +               goto drop;
6931 +       }
6932 +
6933 +       if (skb->len < ntohs(h->tot_len))
6934 +               goto drop;
6935 +
6936 +       if (ip_fast_csum((char *) h, h->ihl))
6937 +               goto drop;
6938 +
6939 +       if (b->udph.source != htons(toi_cluster_port_send) ||
6940 +           b->udph.dest != htons(toi_cluster_port_recv))
6941 +               goto drop;
6942 +
6943 +       if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
6944 +               goto drop;
6945 +
6946 +       len = ntohs(b->udph.len) - sizeof(struct udphdr);
6947 +
6948 +       /* Ok the front looks good, make sure we can get at the rest.  */
6949 +       if (!pskb_may_pull(skb, skb->len))
6950 +               goto drop;
6951 +
6952 +       b = (struct toi_pkt *)skb_network_header(skb);
6953 +       h = &b->iph;
6954 +
6955 +       addr = SADDR;
6956 +       PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
6957 +                       str_message(b->message), NIPQUAD(addr));
6958 +
6959 +       message = b->message & MSG_STATE_MASK;
6960 +       ack = b->message & MSG_ACK_MASK;
6961 +
6962 +       for (index = 0; index < num_local_nodes; index++) {
6963 +               int new_message = node_array[index].current_message,
6964 +                   old_message = new_message;
6965 +
6966 +               if (index == SADDR || !old_message) {
6967 +                       PRINTK("Ignoring node %d (offline or self).\n", index);
6968 +                       continue;
6969 +               }
6970 +
6971 +               /* One message at a time, please. */
6972 +               spin_lock(&node_array[index].receive_lock);
6973 +
6974 +               result = add_update_member(index, SADDR, b->message);
6975 +               if (result == -1) {
6976 +                       printk(KERN_INFO "Failed to add new cluster member "
6977 +                                       NIPQUAD_FMT ".\n",
6978 +                                       NIPQUAD(addr));
6979 +                       goto drop_unlock;
6980 +               }
6981 +
6982 +               switch (b->message & MSG_STATE_MASK) {
6983 +               case MSG_PING:
6984 +                       break;
6985 +               case MSG_ABORT:
6986 +                       break;
6987 +               case MSG_BYE:
6988 +                       break;
6989 +               case MSG_HIBERNATE:
6990 +                       /* Can I hibernate? */
6991 +                       new_message = MSG_HIBERNATE |
6992 +                               ((index & 1) ? MSG_NACK : MSG_ACK);
6993 +                       break;
6994 +               case MSG_IMAGE:
6995 +                       /* Can I resume? */
6996 +                       new_message = MSG_IMAGE |
6997 +                               ((index & 1) ? MSG_NACK : MSG_ACK);
6998 +                       if (new_message != old_message)
6999 +                               printk(KERN_INFO "Setting whether I can resume to %d.\n",
7000 +                                               new_message);
7001 +                       break;
7002 +               case MSG_IO:
7003 +                       new_message = MSG_IO | MSG_ACK;
7004 +                       break;
7005 +               case MSG_RUNNING:
7006 +                       break;
7007 +               default:
7008 +                       if (net_ratelimit())
7009 +                               printk(KERN_ERR "Unrecognised TuxOnIce cluster"
7010 +                                       " message %d from " NIPQUAD_FMT ".\n",
7011 +                                       b->message, NIPQUAD(addr));
7012 +               }
7013 +
7014 +               if (old_message != new_message) {
7015 +                       node_array[index].current_message = new_message;
7016 +                       printk(KERN_INFO ">>> Sending new message for node "
7017 +                                       "%d.\n", index);
7018 +                       toi_send_if(new_message, index);
7019 +               } else if (!ack) {
7020 +                       printk(KERN_INFO ">>> Resending message for node %d.\n",
7021 +                                       index);
7022 +                       toi_send_if(new_message, index);
7023 +               }
7024 +drop_unlock:
7025 +               spin_unlock(&node_array[index].receive_lock);
7026 +       }
7027 +
7028 +drop:
7029 +       /* Throw the packet out. */
7030 +       kfree_skb(skb);
7031 +
7032 +       return 0;
7033 +}
7034 +
7035 +/*
7036 + *  Send cluster message to single interface.
7037 + */
7038 +static void toi_send_if(int message, unsigned long my_id)
7039 +{
7040 +       struct sk_buff *skb;
7041 +       struct toi_pkt *b;
7042 +       int hh_len = LL_RESERVED_SPACE(net_dev);
7043 +       struct iphdr *h;
7044 +
7045 +       /* Allocate packet */
7046 +       skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
7047 +       if (!skb)
7048 +               return;
7049 +       skb_reserve(skb, hh_len);
7050 +       b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
7051 +       memset(b, 0, sizeof(struct toi_pkt));
7052 +
7053 +       /* Construct IP header */
7054 +       skb_reset_network_header(skb);
7055 +       h = ip_hdr(skb);
7056 +       h->version = 4;
7057 +       h->ihl = 5;
7058 +       h->tot_len = htons(sizeof(struct toi_pkt));
7059 +       h->frag_off = htons(IP_DF);
7060 +       h->ttl = 64;
7061 +       h->protocol = IPPROTO_UDP;
7062 +       h->daddr = htonl(INADDR_BROADCAST);
7063 +       h->check = ip_fast_csum((unsigned char *) h, h->ihl);
7064 +
7065 +       /* Construct UDP header */
7066 +       b->udph.source = htons(toi_cluster_port_send);
7067 +       b->udph.dest = htons(toi_cluster_port_recv);
7068 +       b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
7069 +       /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
7070 +
7071 +       /* Construct message */
7072 +       b->message = message;
7073 +       b->sid = my_id;
7074 +       b->htype = net_dev->type; /* can cause undefined behavior */
7075 +       b->hlen = net_dev->addr_len;
7076 +       memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
7077 +       b->secs = htons(3); /* 3 seconds */
7078 +
7079 +       /* Chain packet down the line... */
7080 +       skb->dev = net_dev;
7081 +       skb->protocol = htons(ETH_P_IP);
7082 +       if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
7083 +                    net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
7084 +                       dev_queue_xmit(skb) < 0)
7085 +               printk(KERN_INFO "E");
7086 +}
7087 +
7088 +/*     =========================================               */
7089 +
7090 +/*                     kTOICluster                     */
7091 +
7092 +static atomic_t num_cluster_threads;
7093 +static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
7094 +
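+/*
+ * One of these threads runs per local node: it rebroadcasts the
+ * node's current_message every cluster_message_timeout jiffies until
+ * the message is cleared (see kill_clusterd below), then sends
+ * MSG_BYE and exits. PF_NOFREEZE keeps it running across the freezer.
+ */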
7095 +static int kTOICluster(void *data)
7096 +{
7097 +       unsigned long my_id;
7098 +
7099 +       my_id = atomic_add_return(1, &num_cluster_threads) - 1;
7100 +       node_array[my_id].current_message = (unsigned long) data;
7101 +
7102 +       PRINTK("kTOICluster daemon %lu starting.\n", my_id);
7103 +
7104 +       current->flags |= PF_NOFREEZE;
7105 +
7106 +       while (node_array[my_id].current_message) {
7107 +               toi_send_if(node_array[my_id].current_message, my_id);
7108 +               sleep_on_timeout(&clusterd_events,
7109 +                               cluster_message_timeout);
7110 +               PRINTK("Link state %lu is %d.\n", my_id,
7111 +                               node_array[my_id].current_message);
7112 +       }
7113 +
7114 +       toi_send_if(MSG_BYE, my_id);
7115 +       atomic_dec(&num_cluster_threads);
7116 +       wake_up(&clusterd_events);
7117 +
7118 +       PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
7119 +       __set_current_state(TASK_RUNNING);
7120 +       return 0;
7121 +}
7122 +
7123 +static void kill_clusterd(void)
7124 +{
7125 +       int i;
7126 +
7127 +       for (i = 0; i < num_local_nodes; i++) {
7128 +               if (node_array[i].current_message) {
7129 +                       PRINTK("Seeking to kill clusterd %d.\n", i);
7130 +                       node_array[i].current_message = 0;
7131 +               }
7132 +       }
7133 +       wait_event(clusterd_events,
7134 +                       !atomic_read(&num_cluster_threads));
7135 +       PRINTK("All cluster daemons have exited.\n");
7136 +}
7137 +
7138 +static int peers_not_in_message(int index, int message, int precise)
7139 +{
7140 +       struct cluster_member *this;
7141 +       unsigned long flags;
7142 +       int result = 0;
7143 +
7144 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
7145 +       list_for_each_entry(this, &node_array[index].member_list, list) {
7146 +               if (this->ignore)
7147 +                       continue;
7148 +
7149 +               PRINTK("Peer %d.%d.%d.%d sending %s. "
7150 +                       "Seeking %s.\n",
7151 +                       NIPQUAD(this->addr),
7152 +                       str_message(this->message), str_message(message));
7153 +               if ((precise ? this->message :
7154 +                                       this->message & MSG_STATE_MASK) !=
7155 +                                       message)
7156 +                       result++;
7157 +       }
7158 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
7159 +       PRINTK("%d peers not in sought message.\n", result);
7160 +       return result;
7161 +}
7162 +
7163 +static void reset_ignored(int index)
7164 +{
7165 +       struct cluster_member *this;
7166 +       unsigned long flags;
7167 +
7168 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
7169 +       list_for_each_entry(this, &node_array[index].member_list, list)
7170 +               this->ignore = 0;
7171 +       node_array[index].ignored_peer_count = 0;
7172 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
7173 +}
7174 +
7175 +static int peers_in_message(int index, int message, int precise)
7176 +{
7177 +       return node_array[index].peer_count -
7178 +               node_array[index].ignored_peer_count -
7179 +               peers_not_in_message(index, message, precise);
7180 +}
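+
+/*
+ * Worked example of the arithmetic above: with peer_count = 5,
+ * ignored_peer_count = 1, and two peers whose state does not match the
+ * sought message, peers_in_message() returns 5 - 1 - 2 = 2.
+ */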
7181 +
7182 +static int time_to_continue(int index, unsigned long start, int message)
7183 +{
7184 +       int first = peers_not_in_message(index, message, 0);
7185 +       int second = peers_in_message(index, message, 1);
7186 +
7187 +       PRINTK("First part returns %d, second returns %d.\n", first, second);
7188 +
7189 +       if (!first && !second) {
7190 +               PRINTK("All peers answered message %d.\n",
7191 +                       message);
7192 +               return 1;
7193 +       }
7194 +
7195 +       if (time_after(jiffies, start + continue_delay)) {
7196 +               PRINTK("Timeout reached.\n");
7197 +               return 1;
7198 +       }
7199 +
7200 +       PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
7201 +                       start + continue_delay);
7202 +       return 0;
7203 +}
7204 +
7205 +void toi_initiate_cluster_hibernate(void)
7206 +{
7207 +       int result;
7208 +       unsigned long start;
7209 +
7210 +       result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
7211 +       if (result)
7212 +               return;
7213 +
7214 +       toi_send_if(MSG_HIBERNATE, 0);
7215 +
7216 +       start = jiffies;
7217 +       wait_event(node_array[0].member_events,
7218 +                       time_to_continue(0, start, MSG_HIBERNATE));
7219 +
7220 +       if (test_action_state(TOI_FREEZER_TEST)) {
7221 +               toi_send_if(MSG_ABORT, 0);
7222 +
7223 +               start = jiffies;
7224 +               wait_event(node_array[0].member_events,
7225 +                       time_to_continue(0, start, MSG_RUNNING));
7226 +
7227 +               do_toi_step(STEP_QUIET_CLEANUP);
7228 +               return;
7229 +       }
7230 +
7231 +       toi_send_if(MSG_IO, 0);
7232 +
7233 +       result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
7234 +       if (result)
7235 +               return;
7236 +
7237 +       /* This code runs at resume time too! */
7238 +       if (toi_in_hibernate)
7239 +               result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
7240 +}
7241 +EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate);
7242 +
7243 +/* toi_cluster_print_debug_stats
7244 + *
7245 + * Description:        Print information to be recorded for debugging purposes into a
7246 + *             buffer.
7247 + * Arguments:  buffer: Pointer to a buffer into which the debug info will be
7248 + *                     printed.
7249 + *             size:   Size of the buffer.
7250 + * Returns:    Number of characters written to the buffer.
7251 + */
7252 +static int toi_cluster_print_debug_stats(char *buffer, int size)
7253 +{
7254 +       int len;
7255 +
7256 +       if (strlen(toi_cluster_iface))
7257 +               len = snprintf_used(buffer, size,
7258 +                               "- Cluster interface is '%s'.\n",
7259 +                               toi_cluster_iface);
7260 +       else
7261 +               len = snprintf_used(buffer, size,
7262 +                               "- Cluster support is disabled.\n");
7263 +       return len;
7264 +}
7265 +
7266 +/* cluster_memory_needed
7267 + *
7268 + * Description:        Tell the caller how much memory we need to operate during
7269 + *             hibernate/resume.
7270 + * Returns:    Int. Maximum number of bytes of memory required for
7271 + *             operation.
7272 + */
7273 +static int toi_cluster_memory_needed(void)
7274 +{
7275 +       return 0;
7276 +}
7277 +
7278 +static int toi_cluster_storage_needed(void)
7279 +{
7280 +       return 1 + strlen(toi_cluster_iface);
7281 +}
7282 +
7283 +/* toi_cluster_save_config_info
7284 + *
7285 + * Description:        Save information needed when reloading the image at resume time.
7286 + * Arguments:  Buffer:         Pointer to a buffer of size PAGE_SIZE.
7287 + * Returns:    Number of bytes used for saving our data.
7288 + */
7289 +static int toi_cluster_save_config_info(char *buffer)
7290 +{
7291 +       strcpy(buffer, toi_cluster_iface);
7292 +       return strlen(toi_cluster_iface) + 1;
7293 +}
7294 +
7295 +/* toi_cluster_load_config_info
7296 + *
7297 + * Description:        Reload information needed for declustering the image at
7298 + *             resume time.
7299 + * Arguments:  Buffer:         Pointer to the start of the data.
7300 + *             Size:           Number of bytes that were saved.
7301 + */
7302 +static void toi_cluster_load_config_info(char *buffer, int size)
7303 +{
7304 +       strncpy(toi_cluster_iface, buffer, size);
7305 +       return;
7306 +}
7307 +
7308 +static void cluster_startup(void)
7309 +{
7310 +       int have_image = do_check_can_resume(), i;
7311 +       unsigned long start = jiffies, initial_message;
7312 +       struct task_struct *p;
7313 +
7314 +       initial_message = MSG_IMAGE;
7315 +
7316 +       have_image = 1;
7317 +
7318 +       for (i = 0; i < num_local_nodes; i++) {
7319 +               PRINTK("Starting ktoiclusterd %d.\n", i);
7320 +               p = kthread_create(kTOICluster, (void *) initial_message,
7321 +                               "ktoiclusterd/%d", i);
7322 +               if (IS_ERR(p)) {
7323 +                       printk(KERN_ERR "Failed to start ktoiclusterd.\n");
7324 +                       return;
7325 +               }
7326 +
7327 +               wake_up_process(p);
7328 +       }
7329 +
7330 +       /* Wait for delay or someone else sending first message */
7331 +       wait_event(node_array[0].member_events, time_to_continue(0, start,
7332 +                               MSG_IMAGE));
7333 +
7334 +       others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
7335 +
7336 +       printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
7337 +               " %d.\n", have_image ? "" : "don't ", others_have_image);
7338 +
7339 +       if (have_image) {
7340 +               int result;
7341 +
7342 +               /* Start to resume */
7343 +               printk(KERN_INFO "  === Starting to resume ===  \n");
7344 +               node_array[0].current_message = MSG_IO;
7345 +               toi_send_if(MSG_IO, 0);
7346 +
7347 +               /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
7348 +               result = 0;
7349 +
7350 +               if (!result) {
7351 +                       /*
7352 +                        * Atomic restore - we'll come back in the hibernation
7353 +                        * path.
7354 +                        */
7355 +
7356 +                       /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
7357 +                       result = 0;
7358 +
7359 +                       /* do_toi_step(STEP_QUIET_CLEANUP); */
7360 +               }
7361 +
7362 +               node_array[0].current_message |= MSG_NACK;
7363 +
7364 +               /* For debugging - disable for real life? */
7365 +               wait_event(node_array[0].member_events,
7366 +                               time_to_continue(0, start, MSG_IO));
7367 +       }
7368 +
7369 +       if (others_have_image) {
7370 +               /* Wait for them to resume */
7371 +               printk(KERN_INFO "Waiting for other nodes to resume.\n");
7372 +               start = jiffies;
7373 +               wait_event(node_array[0].member_events,
7374 +                               time_to_continue(0, start, MSG_RUNNING));
7375 +               if (peers_not_in_message(0, MSG_RUNNING, 0))
7376 +                       printk(KERN_INFO "Timed out while waiting for other "
7377 +                                       "nodes to resume.\n");
7378 +       }
7379 +
7380 +       /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
7381 +        * as appropriate.
7382 +        *
7383 +        * If we don't have an image:
7384 +        * - Wait until someone else says they have one, or conditions are met
7385 +        *   for continuing to boot (n machines or t seconds).
7386 +        * - If anyone has an image, wait for them to resume before continuing
7387 +        *   to boot.
7388 +        *
7389 +        * If we have an image:
7390 +        * - Wait until conditions are met before continuing to resume (n
7391 +        *   machines or t seconds). Send RESUME_PREP and freeze processes.
7392 +        *   NACK_PREP if freezing fails (shouldn't) and follow logic for
7393 +        *   us having no image above. On success, wait for [N]ACK_PREP from
7394 +        *   other machines. Read image (including atomic restore) until done.
7395 +        *   Wait for ACK_READ from others (should never fail). Thaw processes
7396 +        *   and do post-resume. (The section after the atomic restore is done
7397 +        *   via the code for hibernating).
7398 +        */
7399 +
7400 +       node_array[0].current_message = MSG_RUNNING;
7401 +}
7402 +
7403 +/* toi_cluster_open_iface
7404 + *
7405 + * Description:        Prepare to use an interface.
7406 + */
7407 +
7408 +static int toi_cluster_open_iface(void)
7409 +{
7410 +       struct net_device *dev;
7411 +
7412 +       rtnl_lock();
7413 +
7414 +       for_each_netdev(&init_net, dev) {
7415 +               if (/* dev == &init_net.loopback_dev || */
7416 +                   strcmp(dev->name, toi_cluster_iface))
7417 +                       continue;
7418 +
7419 +               net_dev = dev;
7420 +               break;
7421 +       }
7422 +
7423 +       rtnl_unlock();
7424 +
7425 +       if (!net_dev) {
7426 +               printk(KERN_ERR MYNAME ": Device %s not found.\n",
7427 +                               toi_cluster_iface);
7428 +               return -ENODEV;
7429 +       }
7430 +
7431 +       dev_add_pack(&toi_cluster_packet_type);
7432 +       added_pack = 1;
7433 +
7434 +       loopback_mode = (net_dev == init_net.loopback_dev);
7435 +       num_local_nodes = loopback_mode ? 8 : 1;
7436 +
7437 +       PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
7438 +                       loopback_mode ? "on" : "off", num_local_nodes);
7439 +
7440 +       cluster_startup();
7441 +       return 0;
7442 +}
7443 +
7444 +/* toi_cluster_close_iface
7445 + *
7446 + * Description: Stop using an interface.
7447 + */
7448 +
7449 +static int toi_cluster_close_iface(void)
7450 +{
7451 +       kill_clusterd();
7452 +       if (added_pack) {
7453 +               dev_remove_pack(&toi_cluster_packet_type);
7454 +               added_pack = 0;
7455 +       }
7456 +       return 0;
7457 +}
7458 +
7459 +static void write_side_effect(void)
7460 +{
7461 +       if (toi_cluster_ops.enabled) {
7462 +               toi_cluster_open_iface();
7463 +               set_toi_state(TOI_CLUSTER_MODE);
7464 +       } else {
7465 +               toi_cluster_close_iface();
7466 +               clear_toi_state(TOI_CLUSTER_MODE);
7467 +       }
7468 +}
7469 +
7470 +static void node_write_side_effect(void)
7471 +{
7472 +}
7473 +
7474 +/*
7475 + * data for our sysfs entries.
7476 + */
7477 +static struct toi_sysfs_data sysfs_params[] = {
7478 +       {
7479 +               TOI_ATTR("interface", SYSFS_RW),
7480 +               SYSFS_STRING(toi_cluster_iface, IFNAMSIZ, 0)
7481 +       },
7482 +
7483 +       {
7484 +               TOI_ATTR("enabled", SYSFS_RW),
7485 +               SYSFS_INT(&toi_cluster_ops.enabled, 0, 1, 0),
7486 +               .write_side_effect = write_side_effect,
7487 +       },
7488 +
7489 +       {
7490 +               TOI_ATTR("cluster_name", SYSFS_RW),
7491 +               SYSFS_STRING(toi_cluster_key, 32, 0)
7492 +       },
7493 +
7494 +       {
7495 +               TOI_ATTR("pre-hibernate-script", SYSFS_RW),
7496 +               SYSFS_STRING(pre_hibernate_script, 256, 0)
7497 +       },
7498 +
7499 +       {
7500 +               TOI_ATTR("post-hibernate-script", SYSFS_RW),
7501 +               SYSFS_STRING(post_hibernate_script, 256, 0)
7502 +       },
7503 +
7504 +       {
7505 +               TOI_ATTR("continue_delay", SYSFS_RW),
7506 +               SYSFS_UL(&continue_delay, HZ / 2, 60 * HZ, 0)
7507 +       }
7508 +};
7509 +
7510 +/*
7511 + * Ops structure.
7512 + */
7513 +
7514 +static struct toi_module_ops toi_cluster_ops = {
7515 +       .type                   = FILTER_MODULE,
7516 +       .name                   = "Cluster",
7517 +       .directory              = "cluster",
7518 +       .module                 = THIS_MODULE,
7519 +       .memory_needed          = toi_cluster_memory_needed,
7520 +       .print_debug_info       = toi_cluster_print_debug_stats,
7521 +       .save_config_info       = toi_cluster_save_config_info,
7522 +       .load_config_info       = toi_cluster_load_config_info,
7523 +       .storage_needed         = toi_cluster_storage_needed,
7524 +
7525 +       .sysfs_data             = sysfs_params,
7526 +       .num_sysfs_entries      = sizeof(sysfs_params) /
7527 +               sizeof(struct toi_sysfs_data),
7528 +};
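+
+/*
+ * A configuration sketch: the entries above appear beneath the
+ * module's "cluster" directory, so (assuming the usual
+ * /sys/power/tuxonice mount point, with "eth0" and "mycluster" purely
+ * illustrative) userspace might do:
+ *
+ *     echo eth0 > /sys/power/tuxonice/cluster/interface
+ *     echo mycluster > /sys/power/tuxonice/cluster/cluster_name
+ *     echo 1 > /sys/power/tuxonice/cluster/enabled
+ *
+ * Writing to "enabled" runs write_side_effect(), which opens the
+ * interface and sets or clears TOI_CLUSTER_MODE.
+ */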
7529 +
7530 +/* ---- Registration ---- */
7531 +
7532 +#ifdef MODULE
7533 +#define INIT static __init
7534 +#define EXIT static __exit
7535 +#else
7536 +#define INIT
7537 +#define EXIT
7538 +#endif
7539 +
7540 +INIT int toi_cluster_init(void)
7541 +{
7542 +       int temp = toi_register_module(&toi_cluster_ops), i;
7543 +       struct kobject *kobj = toi_cluster_ops.dir_kobj;
7544 +
7545 +       for (i = 0; i < MAX_LOCAL_NODES; i++) {
7546 +               node_array[i].current_message = 0;
7547 +               INIT_LIST_HEAD(&node_array[i].member_list);
7548 +               init_waitqueue_head(&node_array[i].member_events);
7549 +               spin_lock_init(&node_array[i].member_list_lock);
7550 +               spin_lock_init(&node_array[i].receive_lock);
7551 +
7552 +               /* Set up sysfs entry */
7553 +               /* allocate a real buffer, not sizeof(char *) bytes */
7554 +               node_array[i].sysfs_data.attr.name = toi_kzalloc(8, 16,
7555 +                               GFP_KERNEL);
7556 +               snprintf((char *) node_array[i].sysfs_data.attr.name, 16,
7557 +                               "node_%d", i);
7558 +               node_array[i].sysfs_data.attr.mode = SYSFS_RW;
7559 +               node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
7560 +               node_array[i].sysfs_data.flags = 0;
7561 +               node_array[i].sysfs_data.data.integer.variable =
7562 +                       (int *) &node_array[i].current_message;
7563 +               node_array[i].sysfs_data.data.integer.minimum = 0;
7564 +               node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
7565 +               node_array[i].sysfs_data.write_side_effect =
7566 +                       node_write_side_effect;
7567 +               toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
7568 +       }
7569 +
7570 +       toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
7571 +
7572 +       if (toi_cluster_ops.enabled)
7573 +               toi_cluster_open_iface();
7574 +
7575 +       return temp;
7576 +}
7577 +
7578 +EXIT void toi_cluster_exit(void)
7579 +{
7580 +       int i;
7581 +       toi_cluster_close_iface();
7582 +
7583 +       for (i = 0; i < MAX_LOCAL_NODES; i++)
7584 +               toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
7585 +                               &node_array[i].sysfs_data);
7586 +       toi_unregister_module(&toi_cluster_ops);
7587 +}
7588 +
7589 +static int __init toi_cluster_iface_setup(char *iface)
7590 +{
7591 +       toi_cluster_ops.enabled = (*iface && strcmp(iface, "off"));
7592 +
7593 +       if (toi_cluster_ops.enabled)
7594 +               strlcpy(toi_cluster_iface, iface, IFNAMSIZ);
7595 +       return 1;
7596 +}
7597 +
7598 +__setup("toi_cluster=", toi_cluster_iface_setup);
7599 +
7600 +#ifdef MODULE
7601 +MODULE_LICENSE("GPL");
7602 +module_init(toi_cluster_init);
7603 +module_exit(toi_cluster_exit);
7604 +MODULE_AUTHOR("Nigel Cunningham");
7605 +MODULE_DESCRIPTION("Cluster Support for TuxOnIce");
7606 +#endif
7607 diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
7608 new file mode 100644
7609 index 0000000..cd9ee3a
7610 --- /dev/null
7611 +++ b/kernel/power/tuxonice_cluster.h
7612 @@ -0,0 +1,19 @@
7613 +/*
7614 + * kernel/power/tuxonice_cluster.h
7615 + *
7616 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
7617 + * Copyright (C) 2006 Red Hat, inc.
7618 + *
7619 + * This file is released under the GPLv2.
7620 + */
7621 +
7622 +#ifdef CONFIG_TOI_CLUSTER
7623 +extern int toi_cluster_init(void);
7624 +extern void toi_cluster_exit(void);
7625 +extern void toi_initiate_cluster_hibernate(void);
7626 +#else
7627 +static inline int toi_cluster_init(void) { return 0; }
7628 +static inline void toi_cluster_exit(void) { }
7629 +static inline void toi_initiate_cluster_hibernate(void) { }
7630 +#endif
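+
+/*
+ * The empty inline stubs above let callers invoke these functions
+ * unconditionally; with CONFIG_TOI_CLUSTER disabled they compile away,
+ * so call sites need no #ifdef. A caller sketch:
+ *
+ *     if (toi_cluster_init())
+ *             printk(KERN_ERR "TuxOnIce: cluster init failed.\n");
+ */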
7631 +
7632 diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
7633 new file mode 100644
7634 index 0000000..9925485
7635 --- /dev/null
7636 +++ b/kernel/power/tuxonice_compress.c
7637 @@ -0,0 +1,436 @@
7638 +/*
7639 + * kernel/power/tuxonice_compress.c
7640 + *
7641 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
7642 + *
7643 + * This file is released under the GPLv2.
7644 + *
7645 + * This file contains data compression routines for TuxOnIce,
7646 + * using cryptoapi.
7647 + */
7648 +
7649 +#include <linux/module.h>
7650 +#include <linux/suspend.h>
7651 +#include <linux/highmem.h>
7652 +#include <linux/vmalloc.h>
7653 +#include <linux/crypto.h>
7654 +
7655 +#include "tuxonice_builtin.h"
7656 +#include "tuxonice.h"
7657 +#include "tuxonice_modules.h"
7658 +#include "tuxonice_sysfs.h"
7659 +#include "tuxonice_io.h"
7660 +#include "tuxonice_ui.h"
7661 +#include "tuxonice_alloc.h"
7662 +
7663 +static int toi_expected_compression;
7664 +
7665 +static struct toi_module_ops toi_compression_ops;
7666 +static struct toi_module_ops *next_driver;
7667 +
7668 +static char toi_compressor_name[32] = "lzf";
7669 +
7670 +static DEFINE_MUTEX(stats_lock);
7671 +
7672 +struct cpu_context {
7673 +       u8 *page_buffer;
7674 +       struct crypto_comp *transform;
7675 +       unsigned int len;
7676 +       char *buffer_start;
7677 +};
7678 +
7679 +static DEFINE_PER_CPU(struct cpu_context, contexts);
7680 +
7681 +static int toi_compress_prepare_result;
7682 +
7683 +/*
7684 + * toi_compress_cleanup
7685 + *
7686 + * Frees memory allocated for our labours.
7687 + */
7688 +static void toi_compress_cleanup(int toi_or_resume)
7689 +{
7690 +       int cpu;
7691 +
7692 +       if (!toi_or_resume)
7693 +               return;
7694 +
7695 +       for_each_online_cpu(cpu) {
7696 +               struct cpu_context *this = &per_cpu(contexts, cpu);
7697 +               if (this->transform) {
7698 +                       crypto_free_comp(this->transform);
7699 +                       this->transform = NULL;
7700 +               }
7701 +
7702 +               if (this->page_buffer)
7703 +                       toi_free_page(16, (unsigned long) this->page_buffer);
7704 +
7705 +               this->page_buffer = NULL;
7706 +       }
7707 +}
7708 +
7709 +/*
7710 + * toi_crypto_prepare
7711 + *
7712 + * Prepare to do some work by allocating buffers and transforms.
7713 + */
7714 +static int toi_compress_crypto_prepare(void)
7715 +{
7716 +       int cpu;
7717 +
7718 +       if (!*toi_compressor_name) {
7719 +               printk(KERN_INFO "TuxOnIce: Compression enabled but no "
7720 +                               "compressor name set.\n");
7721 +               return 1;
7722 +       }
7723 +
7724 +       for_each_online_cpu(cpu) {
7725 +               struct cpu_context *this = &per_cpu(contexts, cpu);
7726 +               this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
7727 +               if (IS_ERR(this->transform)) {
7728 +                       printk(KERN_INFO "TuxOnIce: Failed to initialise the "
7729 +                                       "%s compression transform.\n",
7730 +                                       toi_compressor_name);
7731 +                       this->transform = NULL;
7732 +                       return 1;
7733 +               }
7734 +
7735 +               this->page_buffer =
7736 +                       (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
7737 +
7738 +               if (!this->page_buffer) {
7739 +                       printk(KERN_ERR
7740 +                         "Failed to allocate a page buffer for TuxOnIce "
7741 +                         "compression driver.\n");
7742 +                       return -ENOMEM;
7743 +               }
7744 +       }
7745 +
7746 +       return 0;
7747 +}
7748 +
7749 +/*
7750 + * toi_compress_init
7751 + */
7752 +
7753 +static int toi_compress_init(int toi_or_resume)
7754 +{
7755 +       if (!toi_or_resume)
7756 +               return 0;
7757 +
7758 +       toi_compress_bytes_in = toi_compress_bytes_out = 0;
7759 +
7760 +       next_driver = toi_get_next_filter(&toi_compression_ops);
7761 +
7762 +       if (!next_driver)
7763 +               return -ECHILD;
7764 +
7765 +       toi_compress_prepare_result = toi_compress_crypto_prepare();
7766 +
7767 +       return 0;
7768 +}
7769 +
7770 +/*
7771 + * toi_compress_rw_init()
7772 + */
7773 +
7774 +int toi_compress_rw_init(int rw, int stream_number)
7775 +{
7776 +       if (toi_compress_prepare_result) {
7777 +               printk(KERN_ERR "Failed to initialise compression algorithm.\n");
7778 +               if (rw == READ)
7779 +                       return -ENODEV;
7780 +               else
7781 +                       toi_compression_ops.enabled = 0;
7782 +       }
7783 +
7784 +       return 0;
7785 +}
7786 +
7787 +/*
7788 + * toi_compress_write_page()
7789 + *
7790 + * Compress a page of data, buffering output and passing on filled
7791 + * pages to the next module in the pipeline.
7792 + *
7793 + * Buffer_page:        Pointer to a buffer of size PAGE_SIZE, containing
7794 + * data to be compressed.
7795 + *
7796 + * Returns:    0 on success. Otherwise the error is that returned by later
7797 + *             modules, -ECHILD if we have a broken pipeline or -EIO if
7798 + *             zlib errs.
7799 + */
7800 +static int toi_compress_write_page(unsigned long index,
7801 +               struct page *buffer_page, unsigned int buf_size)
7802 +{
7803 +       int ret, cpu = smp_processor_id();
7804 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
7805 +
7806 +       if (!ctx->transform)
7807 +               return next_driver->write_page(index, buffer_page, buf_size);
7808 +
7809 +       ctx->buffer_start = kmap(buffer_page);
7810 +
7811 +       ctx->len = buf_size;
7812 +
7813 +       ret = crypto_comp_compress(ctx->transform,
7814 +                       ctx->buffer_start, buf_size,
7815 +                       ctx->page_buffer, &ctx->len);
7816 +
7817 +       kunmap(buffer_page);
7818 +
7819 +       if (ret) {
7820 +               printk(KERN_INFO "Compression failed.\n");
7821 +               goto failure;
7822 +       }
7823 +
7824 +       mutex_lock(&stats_lock);
7825 +       toi_compress_bytes_in += buf_size;
7826 +       toi_compress_bytes_out += ctx->len;
7827 +       mutex_unlock(&stats_lock);
7828 +
7829 +       if (ctx->len < buf_size) /* some compression */
7830 +               ret = next_driver->write_page(index,
7831 +                               virt_to_page(ctx->page_buffer),
7832 +                               ctx->len);
7833 +       else
7834 +               ret = next_driver->write_page(index, buffer_page, buf_size);
7835 +
7836 +failure:
7837 +       return ret;
7838 +}
7839 +
7840 +/*
7841 + * toi_compress_read_page()
7842 + * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
7843 + *
7844 + * Retrieve data from later modules and decompress it until the input buffer
7845 + * is filled.
7846 + * Returns: Zero if successful, or an error from this module or downstream.
7847 + */
7848 +static int toi_compress_read_page(unsigned long *index,
7849 +               struct page *buffer_page, unsigned int *buf_size)
7850 +{
7851 +       int ret, cpu = smp_processor_id();
7852 +       unsigned int len;
7853 +       unsigned int outlen = PAGE_SIZE;
7854 +       char *buffer_start;
7855 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
7856 +
7857 +       if (!ctx->transform)
7858 +               return next_driver->read_page(index, buffer_page, buf_size);
7859 +
7860 +       /*
7861 +        * All our reads must be synchronous - we can't decompress
7862 +        * data that hasn't been read yet.
7863 +        */
7864 +
7865 +       *buf_size = PAGE_SIZE;
7866 +
7867 +       ret = next_driver->read_page(index, buffer_page, &len);
7868 +
7869 +       /* Error or uncompressed data */
7870 +       if (ret || len == PAGE_SIZE)
7871 +               return ret;
7872 +
7873 +       buffer_start = kmap(buffer_page);
7874 +       memcpy(ctx->page_buffer, buffer_start, len);
7875 +       ret = crypto_comp_decompress(
7876 +                       ctx->transform,
7877 +                       ctx->page_buffer,
7878 +                       len, buffer_start, &outlen);
7879 +       if (ret)
7880 +               abort_hibernate(TOI_FAILED_IO,
7881 +                       "Compress_read returned %d.\n", ret);
7882 +       else if (outlen != PAGE_SIZE) {
7883 +               abort_hibernate(TOI_FAILED_IO,
7884 +                       "Decompression yielded %u bytes instead of %lu.\n",
7885 +                       outlen, PAGE_SIZE);
7886 +               printk(KERN_ERR "Decompression yielded %u bytes instead of "
7887 +                       "%lu.\n", outlen, PAGE_SIZE);
7888 +               ret = -EIO;
7889 +               *buf_size = outlen;
7890 +       }
7891 +       kunmap(buffer_page);
7892 +       return ret;
7893 +}
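+
+/*
+ * A note on the framing shared by the write and read paths: a page is
+ * stored compressed only when it actually shrank, so (assuming 4 KiB
+ * pages) a 4096-byte block read back is, by construction, raw data and
+ * is returned as-is. A page that compresses to, say, 1800 bytes is
+ * written as 1800 bytes and inflated on read; an incompressible page
+ * travels as the full 4096 bytes in both directions.
+ */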
7894 +
7895 +/*
7896 + * toi_compress_print_debug_stats
7897 + * @buffer: Pointer to a buffer into which the debug info will be printed.
7898 + * @size: Size of the buffer.
7899 + *
7900 + * Print information to be recorded for debugging purposes into a buffer.
7901 + * Returns: Number of characters written to the buffer.
7902 + */
7903 +
7904 +static int toi_compress_print_debug_stats(char *buffer, int size)
7905 +{
7906 +       unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
7907 +                     pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
7908 +       int len;
7909 +
7910 +       /* Output the compression ratio achieved. */
7911 +       if (*toi_compressor_name)
7912 +               len = snprintf_used(buffer, size, "- Compressor is '%s'.\n",
7913 +                               toi_compressor_name);
7914 +       else
7915 +               len = snprintf_used(buffer, size, "- Compressor is not set.\n");
7916 +
7917 +       if (pages_in)
7918 +               len += snprintf_used(buffer+len, size - len,
7919 +                 "  Compressed %lu bytes into %lu (%d percent compression).\n",
7920 +                 toi_compress_bytes_in,
7921 +                 toi_compress_bytes_out,
7922 +                 (pages_in - pages_out) * 100 / pages_in);
7923 +       return len;
7924 +}
7925 +
7926 +/*
7927 + * toi_compress_compression_memory_needed
7928 + *
7929 + * Tell the caller how much memory we need to operate during hibernate/resume.
7930 + * Returns: Int. Maximum number of bytes of memory required for
7931 + * operation.
7932 + */
7933 +static int toi_compress_memory_needed(void)
7934 +{
7935 +       return 2 * PAGE_SIZE;
7936 +}
7937 +
7938 +static int toi_compress_storage_needed(void)
7939 +{
7940 +       return 4 * sizeof(unsigned long) + strlen(toi_compressor_name) + 1;
7941 +}
7942 +
7943 +/*
7944 + * toi_compress_save_config_info
7945 + * @buffer: Pointer to a buffer of size PAGE_SIZE.
7946 + *
7947 + * Save information needed when reloading the image at resume time.
7948 + * Returns: Number of bytes used for saving our data.
7949 + */
7950 +static int toi_compress_save_config_info(char *buffer)
7951 +{
7952 +       int namelen = strlen(toi_compressor_name) + 1;
7953 +       int total_len;
7954 +
7955 +       *((unsigned long *) buffer) = toi_compress_bytes_in;
7956 +       *((unsigned long *) (buffer + 1 * sizeof(unsigned long))) =
7957 +               toi_compress_bytes_out;
7958 +       *((unsigned long *) (buffer + 2 * sizeof(unsigned long))) =
7959 +               toi_expected_compression;
7960 +       *((unsigned long *) (buffer + 3 * sizeof(unsigned long))) = namelen;
7961 +       strncpy(buffer + 4 * sizeof(unsigned long), toi_compressor_name,
7962 +                                                               namelen);
7963 +       total_len = 4 * sizeof(unsigned long) + namelen;
7964 +       return total_len;
7965 +}
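+
+/*
+ * Layout of the config blob written above -- four unsigned longs
+ * followed by the NUL-terminated compressor name:
+ *
+ *     [bytes_in][bytes_out][expected_compression][namelen][name...]
+ *
+ * toi_compress_load_config_info() below unpacks the fields in the
+ * same order.
+ */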
7966 +
7967 +/* toi_compress_load_config_info
7968 + * @buffer: Pointer to the start of the data.
7969 + * @size: Number of bytes that were saved.
7970 + *
7971 + * Description:        Reload information needed for decompressing the image at
7972 + * resume time.
7973 + */
7974 +static void toi_compress_load_config_info(char *buffer, int size)
7975 +{
7976 +       int namelen;
7977 +
7978 +       toi_compress_bytes_in = *((unsigned long *) buffer);
7979 +       toi_compress_bytes_out = *((unsigned long *) (buffer + 1 *
7980 +                               sizeof(unsigned long)));
7981 +       toi_expected_compression = *((unsigned long *) (buffer + 2 *
7982 +                               sizeof(unsigned long)));
7983 +       namelen = *((unsigned long *) (buffer + 3 * sizeof(unsigned long)));
7984 +       strncpy(toi_compressor_name, buffer + 4 * sizeof(unsigned long),
7985 +                       namelen);
7986 +       return;
7987 +}
7988 +
7989 +/*
7990 + * toi_expected_compression_ratio
7991 + *
7992 + * Description:        Returns the expected ratio between data passed into this module
7993 + *             and the amount of data output when writing.
7994 + * Returns:    100 if the module is disabled. Otherwise the value set by the
7995 + *             user via our sysfs entry.
7996 + */
7997 +
7998 +static int toi_compress_expected_ratio(void)
7999 +{
8000 +       if (!toi_compression_ops.enabled)
8001 +               return 100;
8002 +       else
8003 +               return 100 - toi_expected_compression;
8004 +}
8005 +
8006 +/*
8007 + * data for our sysfs entries.
8008 + */
8009 +static struct toi_sysfs_data sysfs_params[] = {
8010 +       {
8011 +               TOI_ATTR("expected_compression", SYSFS_RW),
8012 +               SYSFS_INT(&toi_expected_compression, 0, 99, 0)
8013 +       },
8014 +
8015 +       {
8016 +               TOI_ATTR("enabled", SYSFS_RW),
8017 +               SYSFS_INT(&toi_compression_ops.enabled, 0, 1, 0)
8018 +       },
8019 +
8020 +       {
8021 +               TOI_ATTR("algorithm", SYSFS_RW),
8022 +               SYSFS_STRING(toi_compressor_name, 31, 0)
8023 +       }
8024 +};
8025 +
8026 +/*
8027 + * Ops structure.
8028 + */
8029 +static struct toi_module_ops toi_compression_ops = {
8030 +       .type                   = FILTER_MODULE,
8031 +       .name                   = "compression",
8032 +       .directory              = "compression",
8033 +       .module                 = THIS_MODULE,
8034 +       .initialise             = toi_compress_init,
8035 +       .cleanup                = toi_compress_cleanup,
8036 +       .memory_needed          = toi_compress_memory_needed,
8037 +       .print_debug_info       = toi_compress_print_debug_stats,
8038 +       .save_config_info       = toi_compress_save_config_info,
8039 +       .load_config_info       = toi_compress_load_config_info,
8040 +       .storage_needed         = toi_compress_storage_needed,
8041 +       .expected_compression   = toi_compress_expected_ratio,
8042 +
8043 +       .rw_init                = toi_compress_rw_init,
8044 +
8045 +       .write_page             = toi_compress_write_page,
8046 +       .read_page              = toi_compress_read_page,
8047 +
8048 +       .sysfs_data             = sysfs_params,
8049 +       .num_sysfs_entries      = sizeof(sysfs_params) /
8050 +               sizeof(struct toi_sysfs_data),
8051 +};
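+
+/*
+ * A configuration sketch (assuming the usual /sys/power/tuxonice
+ * mount point and that the cryptoapi provides the named compressor):
+ *
+ *     echo lzf > /sys/power/tuxonice/compression/algorithm
+ *     echo 1 > /sys/power/tuxonice/compression/enabled
+ *     echo 50 > /sys/power/tuxonice/compression/expected_compression
+ *
+ * expected_compression is a percentage (0-99);
+ * toi_compress_expected_ratio() above turns it into the output/input
+ * ratio reported through the ops' .expected_compression hook.
+ */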
8052 +
8053 +/* ---- Registration ---- */
8054 +
8055 +static __init int toi_compress_load(void)
8056 +{
8057 +       return toi_register_module(&toi_compression_ops);
8058 +}
8059 +
8060 +#ifdef MODULE
8061 +static __exit void toi_compress_unload(void)
8062 +{
8063 +       toi_unregister_module(&toi_compression_ops);
8064 +}
8065 +
8066 +module_init(toi_compress_load);
8067 +module_exit(toi_compress_unload);
8068 +MODULE_LICENSE("GPL");
8069 +MODULE_AUTHOR("Nigel Cunningham");
8070 +MODULE_DESCRIPTION("Compression Support for TuxOnIce");
8071 +#else
8072 +late_initcall(toi_compress_load);
8073 +#endif
8074 diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
8075 new file mode 100644
8076 index 0000000..be7b721
8077 --- /dev/null
8078 +++ b/kernel/power/tuxonice_extent.c
8079 @@ -0,0 +1,312 @@
8080 +/*
8081 + * kernel/power/tuxonice_extent.c
8082 + *
8083 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
8084 + *
8085 + * Distributed under GPLv2.
8086 + *
8087 + * These functions encapsulate the manipulation of storage metadata. For
8088 + * pageflags, we use dynamically allocated bitmaps.
8089 + */
8090 +
8091 +#include <linux/module.h>
8092 +#include <linux/suspend.h>
8093 +#include "tuxonice_modules.h"
8094 +#include "tuxonice_extent.h"
8095 +#include "tuxonice_alloc.h"
8096 +#include "tuxonice_ui.h"
8097 +#include "tuxonice.h"
8098 +
8099 +/* toi_get_extent
8100 + *
8101 + * Returns a free extent. May fail, returning NULL instead.
8102 + */
8103 +static struct extent *toi_get_extent(void)
8104 +{
8105 +       struct extent *result;
8106 +
8107 +       result = toi_kzalloc(2, sizeof(struct extent), TOI_ATOMIC_GFP);
8108 +       if (!result)
8109 +               return NULL;
8110 +
8111 +       result->minimum = result->maximum = 0;
8112 +       result->next = NULL;
8113 +
8114 +       return result;
8115 +}
8116 +
8117 +/* toi_put_extent_chain.
8118 + *
8119 + * Frees a whole chain of extents.
8120 + */
8121 +void toi_put_extent_chain(struct extent_chain *chain)
8122 +{
8123 +       struct extent *this;
8124 +
8125 +       this = chain->first;
8126 +
8127 +       while (this) {
8128 +               struct extent *next = this->next;
8129 +               toi_kfree(2, this);
8130 +               chain->num_extents--;
8131 +               this = next;
8132 +       }
8133 +
8134 +       chain->first = chain->last_touched = NULL;
8135 +       chain->size = 0;
8136 +}
8137 +
8138 +/*
8139 + * toi_add_to_extent_chain
8140 + *
8141 + * Add an extent to an existing chain.
8142 + */
8143 +int toi_add_to_extent_chain(struct extent_chain *chain,
8144 +               unsigned long minimum, unsigned long maximum)
8145 +{
8146 +       struct extent *new_extent = NULL, *start_at;
8147 +
8148 +       /* Find the right place in the chain */
8149 +       start_at = (chain->last_touched &&
8150 +                   (chain->last_touched->minimum < minimum)) ?
8151 +               chain->last_touched : NULL;
8152 +
8153 +       if (!start_at && chain->first && chain->first->minimum < minimum)
8154 +               start_at = chain->first;
8155 +
8156 +       while (start_at && start_at->next && start_at->next->minimum < minimum)
8157 +               start_at = start_at->next;
8158 +
8159 +       if (start_at && start_at->maximum == (minimum - 1)) {
8160 +               start_at->maximum = maximum;
8161 +
8162 +               /* Merge with the following one? */
8163 +               if (start_at->next &&
8164 +                   start_at->maximum + 1 == start_at->next->minimum) {
8165 +                       struct extent *to_free = start_at->next;
8166 +                       start_at->maximum = start_at->next->maximum;
8167 +                       start_at->next = start_at->next->next;
8168 +                       chain->num_extents--;
8169 +                       toi_kfree(2, to_free);
8170 +               }
8171 +
8172 +               chain->last_touched = start_at;
8173 +               chain->size += (maximum - minimum + 1);
8174 +
8175 +               return 0;
8176 +       }
8177 +
8178 +       new_extent = toi_get_extent();
8179 +       if (!new_extent) {
8180 +               printk(KERN_INFO "Error: unable to append a new extent "
8181 +                               "to the chain.\n");
8182 +               return -ENOMEM;
8183 +       }
8184 +
8185 +       chain->num_extents++;
8186 +       chain->size += (maximum - minimum + 1);
8187 +       new_extent->minimum = minimum;
8188 +       new_extent->maximum = maximum;
8189 +       new_extent->next = NULL;
8190 +
8191 +       chain->last_touched = new_extent;
8192 +
8193 +       if (start_at) {
8194 +               struct extent *next = start_at->next;
8195 +               start_at->next = new_extent;
8196 +               new_extent->next = next;
8197 +       } else {
8198 +               if (chain->first)
8199 +                       new_extent->next = chain->first;
8200 +               chain->first = new_extent;
8201 +       }
8202 +
8203 +       return 0;
8204 +}
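+
+/*
+ * Worked example of the merge logic above: given an existing extent
+ * [10-15], adding [16-20] satisfies start_at->maximum == minimum - 1
+ * and simply extends it to [10-20]; if a neighbouring extent [21-30]
+ * already followed, the two are then coalesced into [10-30] and the
+ * redundant node freed.
+ */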
8205 +
8206 +/* toi_serialise_extent_chain
8207 + *
8208 + * Write a chain in the image.
8209 + */
8210 +int toi_serialise_extent_chain(struct toi_module_ops *owner,
8211 +               struct extent_chain *chain)
8212 +{
8213 +       struct extent *this;
8214 +       int ret, i = 0;
8215 +
8216 +       ret = toiActiveAllocator->rw_header_chunk(WRITE, owner, (char *) chain,
8217 +                       2 * sizeof(int));
8218 +       if (ret)
8219 +               return ret;
8220 +
8221 +       this = chain->first;
8222 +       while (this) {
8223 +               ret = toiActiveAllocator->rw_header_chunk(WRITE, owner,
8224 +                               (char *) this, 2 * sizeof(unsigned long));
8225 +               if (ret)
8226 +                       return ret;
8227 +               this = this->next;
8228 +               i++;
8229 +       }
8230 +
8231 +       if (i != chain->num_extents) {
8232 +               printk(KERN_EMERG "Saved %d extents but chain metadata says "
8233 +                       "there should be %d.\n", i, chain->num_extents);
8234 +               return 1;
8235 +       }
8236 +
8237 +       return ret;
8238 +}
8239 +
8240 +/* toi_load_extent_chain
8241 + *
8242 + * Read back a chain saved in the image.
8243 + */
8244 +int toi_load_extent_chain(struct extent_chain *chain)
8245 +{
8246 +       struct extent *this, *last = NULL;
8247 +       int i, ret;
8248 +
8249 +       ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
8250 +                       (char *) chain, 2 * sizeof(int));
8251 +       if (ret) {
8252 +               printk(KERN_ERR "Failed to read size of extent chain.\n");
8253 +               return 1;
8254 +       }
8255 +
8256 +       for (i = 0; i < chain->num_extents; i++) {
8257 +               this = toi_kzalloc(3, sizeof(struct extent), TOI_ATOMIC_GFP);
8258 +               if (!this) {
8259 +                       printk(KERN_INFO "Failed to allocate a new extent.\n");
8260 +                       return -ENOMEM;
8261 +               }
8262 +               this->next = NULL;
8263 +               ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
8264 +                               NULL, (char *) this, 2 * sizeof(unsigned long));
8265 +               if (ret) {
8266 +                       printk(KERN_INFO "Failed to read an extent.\n");
8267 +                       return 1;
8268 +               }
8269 +               if (last)
8270 +                       last->next = this;
8271 +               else
8272 +                       chain->first = this;
8273 +               last = this;
8274 +       }
8275 +       return 0;
8276 +}
8277 +
8278 +/* toi_extent_state_next
8279 + *
8280 + * Given a state, progress to the next valid entry. We may begin in an
8281 + * invalid state, as we do when invoked after extent_state_goto_start below.
8282 + *
8283 + * When using compression and expected_compression > 0, we let the image size
8284 + * be larger than storage, so we can validly run out of data to return.
8285 + */
8286 +unsigned long toi_extent_state_next(struct extent_iterate_state *state)
8287 +{
8288 +       if (state->current_chain == state->num_chains)
8289 +               return 0;
8290 +
8291 +       if (state->current_extent) {
8292 +               if (state->current_offset == state->current_extent->maximum) {
8293 +                       if (state->current_extent->next) {
8294 +                               state->current_extent =
8295 +                                       state->current_extent->next;
8296 +                               state->current_offset =
8297 +                                       state->current_extent->minimum;
8298 +                       } else {
8299 +                               state->current_extent = NULL;
8300 +                               state->current_offset = 0;
8301 +                       }
8302 +               } else
8303 +                       state->current_offset++;
8304 +       }
8305 +
8306 +       while (!state->current_extent) {
8307 +               int chain_num = ++(state->current_chain);
8308 +
8309 +               if (chain_num == state->num_chains)
8310 +                       return 0;
8311 +
8312 +               state->current_extent = (state->chains + chain_num)->first;
8313 +
8314 +               if (!state->current_extent)
8315 +                       continue;
8316 +
8317 +               state->current_offset = state->current_extent->minimum;
8318 +       }
8319 +
8320 +       return state->current_offset;
8321 +}
8322 +
8323 +/* toi_extent_state_goto_start
8324 + *
8325 + * Find the first valid value in a group of chains.
8326 + */
8327 +void toi_extent_state_goto_start(struct extent_iterate_state *state)
8328 +{
8329 +       state->current_chain = -1;
8330 +       state->current_extent = NULL;
8331 +       state->current_offset = 0;
8332 +}
8333 +
8334 +/* toi_extent_start_save
8335 + *
8336 + * Given a state and a struct extent_state_store, save the current
8337 + * position in a format that can be used with relocated chains (at
8338 + * resume time).
8339 + */
8340 +void toi_extent_state_save(struct extent_iterate_state *state,
8341 +               struct extent_iterate_saved_state *saved_state)
8342 +{
8343 +       struct extent *extent;
8344 +
8345 +       saved_state->chain_num = state->current_chain;
8346 +       saved_state->extent_num = 0;
8347 +       saved_state->offset = state->current_offset;
8348 +
8349 +       if (saved_state->chain_num == -1)
8350 +               return;
8351 +
8352 +       extent = (state->chains + state->current_chain)->first;
8353 +
8354 +       while (extent != state->current_extent) {
8355 +               saved_state->extent_num++;
8356 +               extent = extent->next;
8357 +       }
8358 +}
8359 +
8360 +/* toi_extent_start_restore
8361 + *
8362 + * Restore the position saved by extent_state_save.
8363 + */
8364 +void toi_extent_state_restore(struct extent_iterate_state *state,
8365 +               struct extent_iterate_saved_state *saved_state)
8366 +{
8367 +       int posn = saved_state->extent_num;
8368 +
8369 +       if (saved_state->chain_num == -1) {
8370 +               toi_extent_state_goto_start(state);
8371 +               return;
8372 +       }
8373 +
8374 +       state->current_chain = saved_state->chain_num;
8375 +       state->current_extent = (state->chains + state->current_chain)->first;
8376 +       state->current_offset = saved_state->offset;
8377 +
8378 +       while (posn--)
8379 +               state->current_extent = state->current_extent->next;
8380 +}
8381 +
8382 +#ifdef CONFIG_TOI_EXPORTS
8383 +EXPORT_SYMBOL_GPL(toi_add_to_extent_chain);
8384 +EXPORT_SYMBOL_GPL(toi_put_extent_chain);
8385 +EXPORT_SYMBOL_GPL(toi_load_extent_chain);
8386 +EXPORT_SYMBOL_GPL(toi_serialise_extent_chain);
8387 +EXPORT_SYMBOL_GPL(toi_extent_state_save);
8388 +EXPORT_SYMBOL_GPL(toi_extent_state_restore);
8389 +EXPORT_SYMBOL_GPL(toi_extent_state_goto_start);
8390 +EXPORT_SYMBOL_GPL(toi_extent_state_next);
8391 +#endif
8392 diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
8393 new file mode 100644
8394 index 0000000..d7dd07e
8395 --- /dev/null
8396 +++ b/kernel/power/tuxonice_extent.h
8397 @@ -0,0 +1,78 @@
8398 +/*
8399 + * kernel/power/tuxonice_extent.h
8400 + *
8401 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
8402 + *
8403 + * This file is released under the GPLv2.
8404 + *
8405 + * It contains declarations related to extents. Extents are
8406 + * TuxOnIce's method of storing some of the metadata for the image.
8407 + * See tuxonice_extent.c for more info.
8408 + *
8409 + */
8410 +
8411 +#include "tuxonice_modules.h"
8412 +
8413 +#ifndef EXTENT_H
8414 +#define EXTENT_H
8415 +
8416 +struct extent {
8417 +       unsigned long minimum, maximum;
8418 +       struct extent *next;
8419 +};
8420 +
8421 +struct extent_chain {
8422 +       int size; /* size of the chain, i.e. sum(max - min + 1) */
8423 +       int num_extents;
8424 +       struct extent *first, *last_touched;
8425 +};
8426 +
8427 +struct extent_iterate_state {
8428 +       struct extent_chain *chains;
8429 +       int num_chains;
8430 +       int current_chain;
8431 +       struct extent *current_extent;
8432 +       unsigned long current_offset;
8433 +};
8434 +
8435 +struct extent_iterate_saved_state {
8436 +       int chain_num;
8437 +       int extent_num;
8438 +       unsigned long offset;
8439 +};
8440 +
8441 +#define toi_extent_state_eof(state) \
8442 +       ((state)->num_chains == (state)->current_chain)
8443 +
8444 +/* Simplify iterating through all the values in an extent chain */
8445 +#define toi_extent_for_each(extent_chain, extentpointer, value) \
8446 +if ((extent_chain)->first) \
8447 +       for ((extentpointer) = (extent_chain)->first, (value) = \
8448 +                       (extentpointer)->minimum; \
8449 +            ((extentpointer) && ((extentpointer)->next || (value) <= \
8450 +                                (extentpointer)->maximum)); \
8451 +            (((value) == (extentpointer)->maximum) ? \
8452 +               ((extentpointer) = (extentpointer)->next, (value) = \
8453 +                ((extentpointer) ? (extentpointer)->minimum : 0)) : \
8454 +                       (value)++))
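+
+/* A usage sketch (my_chain and process() are illustrative names):
+ *
+ *     struct extent *ext;
+ *     unsigned long val;
+ *
+ *     toi_extent_for_each(&my_chain, ext, val)
+ *             process(val);
+ */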
8455 +
8456 +void toi_put_extent_chain(struct extent_chain *chain);
8457 +int toi_add_to_extent_chain(struct extent_chain *chain,
8458 +               unsigned long minimum, unsigned long maximum);
8459 +int toi_serialise_extent_chain(struct toi_module_ops *owner,
8460 +               struct extent_chain *chain);
8461 +int toi_load_extent_chain(struct extent_chain *chain);
8462 +
8463 +/* swap_entry_to_extent_val & extent_val_to_swap_entry:
8464 + * We are putting offset in the low bits so consecutive swap entries
8465 + * make consecutive extent values */
8466 +#define swap_entry_to_extent_val(swp_entry) (swp_entry.val)
8467 +#define extent_val_to_swap_entry(val) (swp_entry_t) { (val) }
8468 +
8469 +void toi_extent_state_save(struct extent_iterate_state *state,
8470 +               struct extent_iterate_saved_state *saved_state);
8471 +void toi_extent_state_restore(struct extent_iterate_state *state,
8472 +               struct extent_iterate_saved_state *saved_state);
8473 +void toi_extent_state_goto_start(struct extent_iterate_state *state);
8474 +unsigned long toi_extent_state_next(struct extent_iterate_state *state);
8475 +#endif
8476 diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
8477 new file mode 100644
8478 index 0000000..d702479
8479 --- /dev/null
8480 +++ b/kernel/power/tuxonice_file.c
8481 @@ -0,0 +1,1124 @@
8482 +/*
8483 + * kernel/power/tuxonice_file.c
8484 + *
8485 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
8486 + *
8487 + * Distributed under GPLv2.
8488 + *
8489 + * This file encapsulates functions for usage of a simple file as a
8490 + * backing store. It is based upon the swapallocator, and shares the
8491 + * same basic working. Here, though, we have nothing to do with
8492 + * swapspace, and only one device to worry about.
8493 + *
8494 + * The user can just
8495 + *
8496 + * echo TuxOnIce > /path/to/my_file
8497 + *
8498 + * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
8499 + *
8500 + * and
8501 + *
8502 + * echo /path/to/my_file > /sys/power/tuxonice/file/target
8503 + *
8504 + * then put what they find in /sys/power/tuxonice/resume
8505 + * as their resume= parameter in lilo.conf (and rerun lilo if using it).
8506 + *
8507 + * Having done this, they're ready to hibernate and resume.
8508 + *
8509 + * TODO:
8510 + * - File resizing.
8511 + */
8512 +
8513 +#include <linux/suspend.h>
8514 +#include <linux/module.h>
8515 +#include <linux/blkdev.h>
8516 +#include <linux/file.h>
8517 +#include <linux/stat.h>
8518 +#include <linux/mount.h>
8519 +#include <linux/statfs.h>
8520 +#include <linux/syscalls.h>
8521 +#include <linux/namei.h>
8522 +#include <linux/fs.h>
8523 +#include <linux/root_dev.h>
8524 +
8525 +#include "tuxonice.h"
8526 +#include "tuxonice_sysfs.h"
8527 +#include "tuxonice_modules.h"
8528 +#include "tuxonice_ui.h"
8529 +#include "tuxonice_extent.h"
8530 +#include "tuxonice_io.h"
8531 +#include "tuxonice_storage.h"
8532 +#include "tuxonice_block_io.h"
8533 +#include "tuxonice_alloc.h"
8534 +
8535 +static struct toi_module_ops toi_fileops;
8536 +
8537 +/* Details of our target.  */
8538 +
8539 +char toi_file_target[256];
8540 +static struct inode *target_inode;
8541 +static struct file *target_file;
8542 +static struct block_device *toi_file_target_bdev;
8543 +static dev_t resume_file_dev_t;
8544 +static int used_devt;
8545 +static int setting_toi_file_target;
8546 +static sector_t target_firstblock, target_header_start;
8547 +static int target_storage_available;
8548 +static int target_claim;
8549 +
8550 +/* Old signatures */
8551 +static char HaveImage[] = "HaveImage\n";
8552 +static char NoImage[] =   "TuxOnIce\n";
8553 +#define sig_size (sizeof(HaveImage) + 1)
8554 +
8555 +struct toi_file_header {
8556 +       char sig[sig_size];
8557 +       int resumed_before;
8558 +       unsigned long first_header_block;
8559 +       int have_image;
8560 +};
8561 +
8562 +/* Header Page Information */
8563 +static int header_pages_reserved;
8564 +
8565 +/* Main Storage Pages */
8566 +static int main_pages_allocated, main_pages_requested;
8567 +
8568 +#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
8569 +
8570 +static struct toi_bdev_info devinfo;
8571 +
8572 +/* Extent chain for blocks */
8573 +static struct extent_chain block_chain;
8574 +
8575 +/* Signature operations */
8576 +enum {
8577 +       GET_IMAGE_EXISTS,
8578 +       INVALIDATE,
8579 +       MARK_RESUME_ATTEMPTED,
8580 +       UNMARK_RESUME_ATTEMPTED,
8581 +};
8582 +
8583 +static void set_devinfo(struct block_device *bdev, int target_blkbits)
8584 +{
8585 +       devinfo.bdev = bdev;
8586 +       if (!target_blkbits) {
8587 +               devinfo.bmap_shift = devinfo.blocks_per_page = 0;
8588 +       } else {
8589 +               devinfo.bmap_shift = target_blkbits - 9;
8590 +               devinfo.blocks_per_page = (1 << (PAGE_SHIFT - target_blkbits));
8591 +       }
8592 +}
8593 +
8594 +static long raw_to_real(long raw)
8595 +{
8596 +       long result;
8597 +
8598 +       result = raw - (raw * (sizeof(unsigned long) + sizeof(int)) +
8599 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
8600 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
8601 +
8602 +       return result < 0 ? 0 : result;
8603 +}
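+
+/*
+ * Worked example for raw_to_real(), assuming a 64-bit build with
+ * 4096-byte pages (12 bytes of index metadata per page): for
+ * raw = 1024 pages the overhead term is
+ * (1024 * 12 + 4109) / 4108 = 3 pages, leaving 1021 pages for data --
+ * and 1021 pages indeed need ceil(1021 * 12 / 4096) = 3 metadata
+ * pages, filling the original 1024 exactly.
+ */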
8604 +
8605 +static int toi_file_storage_available(void)
8606 +{
8607 +       int result = 0;
8608 +       struct block_device *bdev = toi_file_target_bdev;
8609 +
8610 +       if (!target_inode)
8611 +               return 0;
8612 +
8613 +       switch (target_inode->i_mode & S_IFMT) {
8614 +       case S_IFSOCK:
8615 +       case S_IFCHR:
8616 +       case S_IFIFO: /* Socket, Char, Fifo */
8617 +               return -1;
8618 +       case S_IFREG: /* Regular file: current size - holes + free
8619 +                        space on part */
8620 +               result = target_storage_available;
8621 +               break;
8622 +       case S_IFBLK: /* Block device */
8623 +               if (!bdev->bd_disk) {
8624 +                       printk(KERN_INFO "bdev->bd_disk null.\n");
8625 +                       return 0;
8626 +               }
8627 +
8628 +               result = (bdev->bd_part ?
8629 +                       bdev->bd_part->nr_sects :
8630 +                       bdev->bd_disk->capacity) >> (PAGE_SHIFT - 9);
8631 +       }
8632 +
8633 +       return raw_to_real(result);
8634 +}
8635 +
8636 +static int has_contiguous_blocks(int page_num)
8637 +{
8638 +       int j;
8639 +       sector_t last = 0;
8640 +
8641 +       for (j = 0; j < devinfo.blocks_per_page; j++) {
8642 +               sector_t this = bmap(target_inode,
8643 +                               page_num * devinfo.blocks_per_page + j);
8644 +
8645 +               if (!this || (last && (last + 1) != this))
8646 +                       break;
8647 +
8648 +               last = this;
8649 +       }
8650 +
8651 +       return (j == devinfo.blocks_per_page);
8652 +}
8653 +
8654 +static int size_ignoring_ignored_pages(void)
8655 +{
8656 +       int mappable = 0, i;
8657 +
8658 +       if (!target_is_normal_file())
8659 +               return toi_file_storage_available();
8660 +
8661 +       for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++)
8662 +               if (has_contiguous_blocks(i))
8663 +                       mappable++;
8664 +
8665 +       return mappable;
8666 +}
8667 +
8668 +static int __populate_block_list(int min, int max)
8669 +{
8670 +       if (test_action_state(TOI_TEST_BIO))
8671 +               printk(KERN_INFO "Adding extent %d-%d.\n",
8672 +                       min << devinfo.bmap_shift,
8673 +                       ((max + 1) << devinfo.bmap_shift) - 1);
8674 +
8675 +       return toi_add_to_extent_chain(&block_chain, min, max);
8676 +}
8677 +
8678 +static int apply_header_reservation(void)
8679 +{
8680 +       int i;
8681 +
8682 +       /* Apply header space reservation */
8683 +       toi_extent_state_goto_start(&toi_writer_posn);
8684 +       toi_bio_ops.forward_one_page(1); /* To first page */
8685 +
8686 +       for (i = 0; i < header_pages_reserved; i++)
8687 +               if (toi_bio_ops.forward_one_page(1))
8688 +                       return -ENOSPC;
8689 +
8690 +       /* The end of header pages will be the start of pageset 2 */
8691 +       toi_extent_state_save(&toi_writer_posn, &toi_writer_posn_save[2]);
8692 +
8693 +       return 0;
8694 +}
8695 +
8696 +static int populate_block_list(void)
8697 +{
8698 +       int i, extent_min = -1, extent_max = -1, got_header = 0, result = 0;
8699 +
8700 +       if (block_chain.first)
8701 +               toi_put_extent_chain(&block_chain);
8702 +
8703 +       if (!target_is_normal_file()) {
8704 +               return (target_storage_available > 0) ?
8705 +                       __populate_block_list(devinfo.blocks_per_page,
8706 +                               (target_storage_available + 1) *
8707 +                               devinfo.blocks_per_page - 1) : 0;
8708 +       }
8709 +
8710 +       for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
8711 +               sector_t new_sector;
8712 +
8713 +               if (!has_contiguous_blocks(i))
8714 +                       continue;
8715 +
8716 +               new_sector = bmap(target_inode,
8717 +                               (i * devinfo.blocks_per_page));
8718 +
8719 +               /*
8720 +                * Ignore the first block in the file.
8721 +                * It gets the header.
8722 +                */
8723 +               if (new_sector == target_firstblock >> devinfo.bmap_shift) {
8724 +                       got_header = 1;
8725 +                       continue;
8726 +               }
8727 +
8728 +               /*
8729 +                * I'd love to be able to fill in holes and resize
8730 +                * files, but not yet...
8731 +                */
8732 +
8733 +               if (new_sector == extent_max + 1)
8734 +                       extent_max += devinfo.blocks_per_page;
8735 +               else {
8736 +                       if (extent_min > -1) {
8737 +                               result = __populate_block_list(extent_min,
8738 +                                               extent_max);
8739 +                               if (result)
8740 +                                       return result;
8741 +                       }
8742 +
8743 +                       extent_min = new_sector;
8744 +                       extent_max = extent_min +
8745 +                               devinfo.blocks_per_page - 1;
8746 +               }
8747 +       }
8748 +
8749 +       if (extent_min > -1) {
8750 +               result = __populate_block_list(extent_min, extent_max);
8751 +               if (result)
8752 +                       return result;
8753 +       }
8754 +
8755 +       return apply_header_reservation();
8756 +}
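+
+/*
+ * Worked example (illustrative numbers): with blocks_per_page == 4, a
+ * file whose successive pages bmap() to sectors 100, 104 and 116 first
+ * grows a single extent 100-107 (104 == 103 + 1 extends it), then emits
+ * that extent and starts a new one, 116-119, when the gap is seen.
+ */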
8757 +
8758 +static void toi_file_cleanup(int finishing_cycle)
8759 +{
8760 +       if (toi_file_target_bdev) {
8761 +               if (target_claim) {
8762 +                       bd_release(toi_file_target_bdev);
8763 +                       target_claim = 0;
8764 +               }
8765 +
8766 +               if (used_devt) {
8767 +                       blkdev_put(toi_file_target_bdev);
8768 +                       used_devt = 0;
8769 +               }
8770 +               toi_file_target_bdev = NULL;
8771 +               target_inode = NULL;
8772 +               set_devinfo(NULL, 0);
8773 +               target_storage_available = 0;
8774 +       }
8775 +
8776 +       if (target_file) {      /* pointer: test non-NULL, not "> 0" */
8777 +               filp_close(target_file, NULL);
8778 +               target_file = NULL;
8779 +       }
8780 +}
8781 +
8782 +/*
8783 + * reopen_resume_devt
8784 + *
8785 + * Having opened resume= once, we remember the major and
8786 + * minor nodes and use them to reopen the bdev for checking
8787 + * whether an image exists (possibly when starting a resume).
8788 + */
8789 +static void reopen_resume_devt(void)
8790 +{
8791 +       toi_file_target_bdev = toi_open_by_devnum(resume_file_dev_t,
8792 +                       FMODE_READ);
8793 +       if (IS_ERR(toi_file_target_bdev)) {
8794 +               printk(KERN_INFO "Got a dev_num (%lx) but failed to open it.\n",
8795 +                               (unsigned long) resume_file_dev_t);
8796 +               return;
8797 +       }
8798 +       target_inode = toi_file_target_bdev->bd_inode;
8799 +       set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
8800 +}
8801 +
8802 +static void toi_file_get_target_info(char *target, int get_size,
8803 +               int resume_param)
8804 +{
8805 +       if (target_file)
8806 +               toi_file_cleanup(0);
8807 +
8808 +       if (!target || !strlen(target))
8809 +               return;
8810 +
8811 +       target_file = filp_open(target, O_RDWR|O_LARGEFILE, 0);
8812 +
8813 +       if (IS_ERR(target_file) || !target_file) {
8814 +
8815 +               if (!resume_param) {
8816 +                       printk(KERN_INFO "Open file %s returned %p.\n",
8817 +                                       target, target_file);
8818 +                       target_file = NULL;
8819 +                       return;
8820 +               }
8821 +
8822 +               target_file = NULL;
8823 +               resume_file_dev_t = name_to_dev_t(target);
8824 +               if (!resume_file_dev_t) {
8825 +                       struct kstat stat;
8826 +                       int error = vfs_stat(target, &stat);
8827 +                       printk(KERN_INFO "Open file %s returned %p and "
8828 +                                       "name_to_devt failed.\n", target,
8829 +                                       target_file);
8830 +                       if (error)
8831 +                               printk(KERN_INFO "Statting the file also failed."
8832 +                                       " Nothing more we can do.\n");
8833 +                       else
8834 +                               resume_file_dev_t = stat.rdev;
8835 +                       return;
8836 +               }
8837 +
8838 +               toi_file_target_bdev = toi_open_by_devnum(resume_file_dev_t,
8839 +                               FMODE_READ);
8840 +               if (IS_ERR(toi_file_target_bdev)) {
8841 +                       printk(KERN_INFO "Got a dev_num (%lx) but failed to "
8842 +                                       "open it.\n",
8843 +                                       (unsigned long) resume_file_dev_t);
8844 +                       return;
8845 +               }
8846 +               used_devt = 1;
8847 +               target_inode = toi_file_target_bdev->bd_inode;
8848 +       } else
8849 +               target_inode = target_file->f_mapping->host;
8850 +
8851 +       if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
8852 +           S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
8853 +               printk(KERN_INFO "File support works with regular files,"
8854 +                               " character files and block devices.\n");
8855 +               goto cleanup;
8856 +       }
8857 +
8858 +       if (!used_devt) {
8859 +               if (S_ISBLK(target_inode->i_mode)) {
8860 +                       toi_file_target_bdev = I_BDEV(target_inode);
8861 +                       if (!bd_claim(toi_file_target_bdev, &toi_fileops))
8862 +                               target_claim = 1;
8863 +               } else
8864 +                       toi_file_target_bdev = target_inode->i_sb->s_bdev;
8865 +               resume_file_dev_t = toi_file_target_bdev->bd_dev;
8866 +       }
8867 +
8868 +       set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
8869 +
8870 +       if (get_size)
8871 +               target_storage_available = size_ignoring_ignored_pages();
8872 +
8873 +       if (!resume_param)
8874 +               target_firstblock = bmap(target_inode, 0) << devinfo.bmap_shift;
8875 +
8876 +       return;
8877 +cleanup:
8878 +       target_inode = NULL;
8879 +       if (target_file) {
8880 +               filp_close(target_file, NULL);
8881 +               target_file = NULL;
8882 +       }
8883 +       set_devinfo(NULL, 0);
8884 +       target_storage_available = 0;
8885 +}
8886 +
8887 +static void toi_file_noresume_reset(void)
8888 +{
8889 +       toi_bio_ops.rw_cleanup(READ);
8890 +}
8891 +
8892 +static int parse_signature(struct toi_file_header *header)
8893 +{
8894 +       int have_image = !memcmp(HaveImage, header->sig, sizeof(HaveImage) - 1);
8895 +       int no_image_header = !memcmp(NoImage, header->sig,
8896 +                       sizeof(NoImage) - 1);
8897 +       int binary_sig = !memcmp(tuxonice_signature, header->sig,
8898 +                       sizeof(tuxonice_signature));
8899 +
8900 +       if (no_image_header || (binary_sig && !header->have_image))
8901 +               return 0;
8902 +
8903 +       if (!have_image && !binary_sig)
8904 +               return -1;
8905 +
8906 +       if (header->resumed_before)
8907 +               set_toi_state(TOI_RESUMED_BEFORE);
8908 +       else
8909 +               clear_toi_state(TOI_RESUMED_BEFORE);
8910 +
8911 +       target_header_start = header->first_header_block;
8912 +       return 1;
8913 +}
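+
+/*
+ * Summary of parse_signature() return values (added for clarity):
+ *   0   no image: an explicit NoImage marker, or a binary signature
+ *       with have_image == 0;
+ *  -1   the block carries neither a known text nor binary signature;
+ *   1   a usable image was found and target_header_start has been set.
+ */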
8914 +
8915 +/* prepare_signature */
8916 +
8917 +static int prepare_signature(struct toi_file_header *current_header,
8918 +               unsigned long first_header_block)
8919 +{
8920 +       strncpy(current_header->sig, tuxonice_signature,
8921 +                       sizeof(tuxonice_signature));
8922 +       current_header->resumed_before = 0;
8923 +       current_header->first_header_block = first_header_block;
8924 +       current_header->have_image = 1;
8925 +       return 0;
8926 +}
8927 +
8928 +static int toi_file_storage_allocated(void)
8929 +{
8930 +       if (!target_inode)
8931 +               return 0;
8932 +
8933 +       if (target_is_normal_file())
8934 +               return (int) raw_to_real(target_storage_available);
8935 +       else
8936 +               return (int) raw_to_real(main_pages_requested);
8937 +}
8938 +
8939 +static int toi_file_release_storage(void)
8940 +{
8941 +       if (test_action_state(TOI_KEEP_IMAGE) &&
8942 +           test_toi_state(TOI_NOW_RESUMING))
8943 +               return 0;
8944 +
8945 +       toi_put_extent_chain(&block_chain);
8946 +
8947 +       header_pages_reserved = 0;
8948 +       main_pages_allocated = 0;
8949 +       main_pages_requested = 0;
8950 +       return 0;
8951 +}
8952 +
8953 +static void toi_file_reserve_header_space(int request)
8954 +{
8955 +       header_pages_reserved = request;
8956 +       apply_header_reservation();
8957 +}
8958 +
8959 +static int toi_file_allocate_storage(int main_space_requested)
8960 +{
8961 +       int result = 0;
8962 +
8963 +       int extra_pages = DIV_ROUND_UP(main_space_requested *
8964 +                       (sizeof(unsigned long) + sizeof(int)), PAGE_SIZE);
8965 +       int pages_to_get = main_space_requested + extra_pages +
8966 +               header_pages_reserved;
8967 +       int blocks_to_get = pages_to_get - block_chain.size;
8968 +
8969 +       /* Only release_storage reduces the size */
8970 +       if (blocks_to_get < 1)
8971 +               return 0;
8972 +
8973 +       result = populate_block_list();
8974 +
8975 +       if (result)
8976 +               return result;
8977 +
8978 +       toi_message(TOI_WRITER, TOI_MEDIUM, 0,
8979 +               "Finished with block_chain.size == %d.\n",
8980 +               block_chain.size);
8981 +
8982 +       if (block_chain.size < pages_to_get) {
8983 +               printk("Block chain size (%d) < header pages (%d) + extra "
8984 +                       "pages (%d) + main pages (%d) (=%d pages).\n",
8985 +                       block_chain.size, header_pages_reserved, extra_pages,
8986 +                       main_space_requested, pages_to_get);
8987 +               result = -ENOSPC;
8988 +       }
8989 +
8990 +       main_pages_requested = main_space_requested;
8991 +       main_pages_allocated = main_space_requested + extra_pages;
8992 +       return result;
8993 +}
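+
+/*
+ * Worked example (illustrative, assuming a 64-bit kernel and 4096-byte
+ * pages): each image page needs sizeof(unsigned long) + sizeof(int) ==
+ * 12 bytes of index data, so a request for 1000 pages adds
+ * DIV_ROUND_UP(12000, 4096) == 3 extra pages; with 2 header pages
+ * reserved, pages_to_get == 1005.
+ */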
8994 +
8995 +static int toi_file_write_header_init(void)
8996 +{
8997 +       int result;
8998 +
8999 +       toi_bio_ops.rw_init(WRITE, 0);
9000 +       toi_writer_buffer_posn = 0;
9001 +
9002 +       /* Info needed to bootstrap goes at the start of the header.
9003 +        * First we save the basic info needed for reading, including the number
9004 +        * of header pages. Then we save the structs containing data needed
9005 +        * for reading the header pages back.
9006 +        * Note that even if the header takes more than one page, when we
9007 +        * read back the info, we will have restored the location of the
9008 +        * next header page by the time we go to use it.
9009 +        */
9010 +
9011 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_fileops,
9012 +                       (char *) &toi_writer_posn_save,
9013 +                       sizeof(toi_writer_posn_save));
9014 +
9015 +       if (result)
9016 +               return result;
9017 +
9018 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_fileops,
9019 +                       (char *) &devinfo, sizeof(devinfo));
9020 +
9021 +       if (result)
9022 +               return result;
9023 +
9024 +       toi_serialise_extent_chain(&toi_fileops, &block_chain);
9025 +
9026 +       return 0;
9027 +}
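+
+/*
+ * Resulting on-disk header layout (sketch inferred from the writes above):
+ *
+ *   [toi_writer_posn_save][devinfo][serialised block_chain] ...
+ *
+ * toi_file_read_header_init() below consumes the fields in exactly
+ * this order.
+ */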
9028 +
9029 +static int toi_file_write_header_cleanup(void)
9030 +{
9031 +       struct toi_file_header *header;
9032 +       int result;
9033 +       unsigned long sig_page = toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
9034 +
+       if (!sig_page)
+               return -ENOMEM;
+
9035 +       /* Write any unsaved data */
9036 +       if (toi_writer_buffer_posn)
9037 +               toi_bio_ops.write_header_chunk_finish();
9038 +
9039 +       toi_bio_ops.finish_all_io();
9040 +
9041 +       toi_extent_state_goto_start(&toi_writer_posn);
9042 +       toi_bio_ops.forward_one_page(1);
9043 +
9044 +       /* Adjust image header */
9045 +       result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
9046 +                       target_firstblock,
9047 +                       virt_to_page(sig_page));
9048 +       if (result)
9049 +               goto out;
9050 +
9051 +       header = (struct toi_file_header *) sig_page;
9052 +
9053 +       prepare_signature(header,
9054 +                       toi_writer_posn.current_offset <<
9055 +                       devinfo.bmap_shift);
9056 +
9057 +       result = toi_bio_ops.bdev_page_io(WRITE, toi_file_target_bdev,
9058 +                       target_firstblock,
9059 +                       virt_to_page(sig_page));
9060 +
9061 +out:
9062 +       toi_bio_ops.finish_all_io();
9063 +       toi_free_page(38, sig_page);
9064 +
9065 +       return result;
9066 +}
9067 +
9068 +/* HEADER READING */
9069 +
9070 +/*
9071 + * read_header_init()
9072 + *
9073 + * Description:
9074 + * 1. Attempt to read the device specified with resume=.
9075 + * 2. Check the contents of the header for our signature.
9076 + * 3. Warn, ignore, reset and/or continue as appropriate.
9077 + * 4. If continuing, read the toi_file configuration section
9078 + *    of the header and set up block device info so we can read
9079 + *    the rest of the header & image.
9080 + *
9081 + * Returns:
9082 + * May not return if the user chooses to reboot at a warning.
9083 + * -EINVAL if we cannot resume at this time. Booting should continue
9084 + * normally.
9085 + */
9086 +
9087 +static int toi_file_read_header_init(void)
9088 +{
9089 +       int result;
9090 +       struct block_device *tmp;
9091 +
9092 +       toi_bio_ops.read_header_init();
9093 +
9094 +       /* Read toi_file configuration */
9095 +       result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
9096 +                       target_header_start,
9097 +                       virt_to_page((unsigned long) toi_writer_buffer));
9098 +
9099 +       if (result) {
9100 +               printk("FileAllocator read header init: Failed to initialise "
9101 +                               "reading the first page of data.\n");
9102 +               toi_bio_ops.rw_cleanup(READ);
9103 +               return result;
9104 +       }
9105 +
9106 +       memcpy(&toi_writer_posn_save, toi_writer_buffer,
9107 +              sizeof(toi_writer_posn_save));
9108 +
9109 +       toi_writer_buffer_posn = sizeof(toi_writer_posn_save);
9110 +
9111 +       tmp = devinfo.bdev;
9112 +
9113 +       memcpy(&devinfo,
9114 +              toi_writer_buffer + toi_writer_buffer_posn,
9115 +              sizeof(devinfo));
9116 +
9117 +       devinfo.bdev = tmp;
9118 +       toi_writer_buffer_posn += sizeof(devinfo);
9119 +
9120 +       toi_extent_state_goto_start(&toi_writer_posn);
9121 +       toi_bio_ops.set_extra_page_forward();
9122 +
9123 +       return toi_load_extent_chain(&block_chain);
9124 +}
9125 +
9126 +static int toi_file_read_header_cleanup(void)
9127 +{
9128 +       toi_bio_ops.rw_cleanup(READ);
9129 +       return 0;
9130 +}
9131 +
9132 +static int toi_file_signature_op(int op)
9133 +{
9134 +       char *cur;
9135 +       int result = 0, changed = 0;
9136 +       struct toi_file_header *header;
9137 +
+       /* The bdev may be NULL, or an ERR_PTR left by a failed open */
9138 +       if (!toi_file_target_bdev || IS_ERR(toi_file_target_bdev))
9139 +               return -1;
9140 +
9141 +       cur = (char *) toi_get_zeroed_page(17, TOI_ATOMIC_GFP);
9142 +       if (!cur) {
9143 +               printk("Unable to allocate a page for reading the image "
9144 +                               "signature.\n");
9145 +               return -ENOMEM;
9146 +       }
9147 +
9148 +       result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
9149 +                       target_firstblock,
9150 +                       virt_to_page(cur));
9151 +
9152 +       if (result)
9153 +               goto out;
9154 +
9155 +       header = (struct toi_file_header *) cur;
9156 +       result = parse_signature(header);
9157 +
9158 +       switch (op) {
9159 +       case INVALIDATE:
9160 +               if (result == -1)
9161 +                       goto out;
9162 +
9163 +               strcpy(header->sig, tuxonice_signature);
9164 +               header->resumed_before = 0;
9165 +               header->have_image = 0;
9166 +               result = changed = 1;
9167 +               break;
9168 +       case MARK_RESUME_ATTEMPTED:
9169 +               if (result == 1) {
9170 +                       header->resumed_before = 1;
9171 +                       changed = 1;
9172 +               }
9173 +               break;
9174 +       case UNMARK_RESUME_ATTEMPTED:
9175 +               if (result == 1) {
9176 +                       header->resumed_before = 0;
9177 +                       changed = 1;
9178 +               }
9179 +               break;
9180 +       }
9181 +
9182 +       if (changed) {
9183 +               int io_result = toi_bio_ops.bdev_page_io(WRITE,
9184 +                               toi_file_target_bdev, target_firstblock,
9185 +                               virt_to_page(cur));
9186 +               if (io_result)
9187 +                       result = io_result;
9188 +       }
9189 +
9190 +out:
9191 +       toi_bio_ops.finish_all_io();
9192 +       toi_free_page(17, (unsigned long) cur);
9193 +       return result;
9194 +}
9195 +
9196 +/* Print debug info
9197 + *
9198 + * Description: Append the FileAllocator's state to the debug buffer.
9199 + */
9200 +
9201 +static int toi_file_print_debug_stats(char *buffer, int size)
9202 +{
9203 +       int len = 0;
9204 +
9205 +       if (toiActiveAllocator != &toi_fileops) {
9206 +               len = snprintf_used(buffer, size,
9207 +                               "- FileAllocator inactive.\n");
9208 +               return len;
9209 +       }
9210 +
9211 +       len = snprintf_used(buffer, size, "- FileAllocator active.\n");
9212 +
9213 +       len += snprintf_used(buffer+len, size-len, "  Storage available for "
9214 +                       "image: %d pages.\n",
9215 +                       toi_file_storage_allocated());
9216 +
9217 +       return len;
9218 +}
9219 +
9220 +/*
9221 + * Storage needed
9222 + *
9223 + * Returns amount of space in the image header required
9224 + * for the toi_file's data.
9225 + *
9226 + * We ensure the space is allocated, but actually save the
9227 + * data from write_header_init and therefore don't also define a
9228 + * save_config_info routine.
9229 + */
9230 +static int toi_file_storage_needed(void)
9231 +{
9232 +       return sig_size + strlen(toi_file_target) + 1 +
9233 +               sizeof(toi_writer_posn_save) +
9234 +               sizeof(devinfo) +
9235 +               sizeof(struct extent_chain) - 2 * sizeof(void *) +
9236 +               (2 * sizeof(unsigned long) * block_chain.num_extents);
9237 +}
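+
+/*
+ * Illustrative sizing: an extent stores a start and an end value, so on
+ * a 64-bit kernel it costs 2 * sizeof(unsigned long) == 16 bytes; a
+ * block chain of 50 extents therefore adds 800 bytes to the header, on
+ * top of the fixed-size fields counted above.
+ */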
9238 +
9239 +/*
9240 + * toi_file_remove_image
9241 + *
9242 + */
9243 +static int toi_file_remove_image(void)
9244 +{
9245 +       toi_file_release_storage();
9246 +       return toi_file_signature_op(INVALIDATE);
9247 +}
9248 +
9249 +/*
9250 + * Image_exists
9251 + *
9252 + */
9253 +
9254 +static int toi_file_image_exists(int quiet)
9255 +{
9256 +       if (!toi_file_target_bdev)
9257 +               reopen_resume_devt();
9258 +
9259 +       return toi_file_signature_op(GET_IMAGE_EXISTS);
9260 +}
9261 +
9262 +/*
9263 + * Mark resume attempted.
9264 + *
9265 + * Record that we tried to resume from this image.
9266 + */
9267 +
9268 +static int toi_file_mark_resume_attempted(int mark)
9269 +{
9270 +       return toi_file_signature_op(mark ? MARK_RESUME_ATTEMPTED :
9271 +               UNMARK_RESUME_ATTEMPTED);
9272 +}
9273 +
9274 +static void toi_file_set_resume_param(void)
9275 +{
9276 +       char *buffer = (char *) toi_get_zeroed_page(18, TOI_ATOMIC_GFP);
9277 +       char *buffer2 = (char *) toi_get_zeroed_page(19, TOI_ATOMIC_GFP);
9278 +       unsigned long sector = bmap(target_inode, 0);
9279 +       int offset = 0;
9280 +
9281 +       if (!buffer || !buffer2) {
9282 +               if (buffer)
9283 +                       toi_free_page(18, (unsigned long) buffer);
9284 +               if (buffer2)
9285 +                       toi_free_page(19, (unsigned long) buffer2);
9286 +               printk("TuxOnIce: Failed to allocate memory while setting "
9287 +                               "resume= parameter.\n");
9288 +               return;
9289 +       }
9290 +
9291 +       if (toi_file_target_bdev) {
9292 +               set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
9293 +
9294 +               bdevname(toi_file_target_bdev, buffer2);
9295 +               offset += snprintf(buffer + offset, PAGE_SIZE - offset,
9296 +                               "/dev/%s", buffer2);
9297 +
9298 +               if (sector)
9299 +                       offset += snprintf(buffer + offset, PAGE_SIZE - offset,
9300 +                               ":0x%lx", sector << devinfo.bmap_shift);
9301 +       } else
9302 +               offset += snprintf(buffer + offset, PAGE_SIZE - offset,
9303 +                               "%s is not a valid target.", toi_file_target);
9304 +
9305 +       sprintf(resume_file, "file:%s", buffer);
9306 +
9307 +       toi_free_page(18, (unsigned long) buffer);
9308 +       toi_free_page(19, (unsigned long) buffer2);
9309 +
9310 +       toi_attempt_to_parse_resume_device(1);
9311 +}
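+
+/*
+ * Example result (hypothetical device and sector numbers): for a target
+ * file on /dev/hda2 whose first data block corresponds to sector 0x4c8
+ * after the bmap shift, this routine sets resume_file to
+ * "file:/dev/hda2:0x4c8" and hands it to
+ * toi_attempt_to_parse_resume_device() for validation.
+ */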
9312 +
9313 +static int __test_toi_file_target(char *target, int resume_time, int quiet)
9314 +{
9315 +       toi_file_get_target_info(target, 0, resume_time);
9316 +       if (toi_file_signature_op(GET_IMAGE_EXISTS) > -1) {
9317 +               if (!quiet)
9318 +                       printk(KERN_INFO "TuxOnIce: FileAllocator: File "
9319 +                                       "signature found.\n");
9320 +               if (!resume_time)
9321 +                       toi_file_set_resume_param();
9322 +
9323 +               toi_bio_ops.set_devinfo(&devinfo);
9324 +               toi_writer_posn.chains = &block_chain;
9325 +               toi_writer_posn.num_chains = 1;
9326 +
9327 +               if (!resume_time)
9328 +                       set_toi_state(TOI_CAN_HIBERNATE);
9329 +               return 0;
9330 +       }
9331 +
9332 +       clear_toi_state(TOI_CAN_HIBERNATE);
9333 +
9334 +       if (quiet)
9335 +               return 1;
9336 +
9337 +       if (*target)
9338 +               printk(KERN_INFO "TuxOnIce: FileAllocator: Sorry. No signature "
9339 +                               "found at %s.\n", target);
9340 +       else
9341 +               if (!resume_time)
9342 +                       printk(KERN_INFO "TuxOnIce: FileAllocator: Sorry. "
9343 +                                       "Target is not set for hibernating.\n");
9344 +
9345 +       return 1;
9346 +}
9347 +
9348 +static void test_toi_file_target(void)
9349 +{
9350 +       setting_toi_file_target = 1;
9351 +
9352 +       printk(KERN_INFO "TuxOnIce: Hibernating %sabled.\n",
9353 +                       __test_toi_file_target(toi_file_target, 0, 1) ?
9354 +                       "dis" : "en");
9355 +
9356 +       setting_toi_file_target = 0;
9357 +}
9358 +
9359 +/*
9360 + * Parse Image Location
9361 + *
9362 + * Attempt to parse a resume= parameter.
9363 + * File Allocator accepts:
9364 + * resume=file:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
9365 + *
9366 + * Where:
9367 + * DEVNAME is convertible to a dev_t by name_to_dev_t;
9368 + * FIRSTBLOCK is the location of the first block in the file;
9369 + * BLOCKSIZE is the logical blocksize of the device: a multiple of
9370 + * SECTOR_SIZE, at least SECTOR_SIZE and at most PAGE_SIZE.
9371 + * Data is validated by attempting to read a header from the
9372 + * location given. Failure will result in toi_file refusing to
9373 + * save an image, and a reboot with correct parameters will be
9374 + * necessary.
9375 + */
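+
+/*
+ * Examples (hypothetical device names and numbers, for illustration only):
+ *
+ *   resume=file:/dev/hda2:0x4c8          first (signature) block at 0x4c8
+ *   resume=file:/dev/hda2:0x4c8@4096     the same, with a 4096-byte blocksize
+ */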
9376 +
9377 +static int toi_file_parse_sig_location(char *commandline,
9378 +               int only_writer, int quiet)
9379 +{
9380 +       char *thischar, *devstart = NULL, *colon = NULL, *at_symbol = NULL;
9381 +       int result = -EINVAL, target_blocksize = 0;
9382 +
9383 +       if (strncmp(commandline, "file:", 5)) {
9384 +               if (!only_writer)
9385 +                       return 1;
9386 +       } else
9387 +               commandline += 5;
9388 +
9389 +       /*
9390 +        * Don't check signature again if we're beginning a cycle. If we already
9391 +        * did the initialisation successfully, assume we'll be okay when it
9392 +        * comes to resuming.
9393 +        */
9394 +       if (toi_file_target_bdev)
9395 +               return 0;
9396 +
9397 +       devstart = thischar = commandline;
9398 +       while ((*thischar != ':') && (*thischar != '@') &&
9399 +               ((thischar - commandline) < 250) && (*thischar))
9400 +               thischar++;
9401 +
9402 +       if (*thischar == ':') {
9403 +               colon = thischar;
9404 +               *colon = 0;
9405 +               thischar++;
9406 +       }
9407 +
9408 +       while ((*thischar != '@') && ((thischar - commandline) < 250)
9409 +                       && (*thischar))
9410 +               thischar++;
9411 +
9412 +       if (*thischar == '@') {
9413 +               at_symbol = thischar;
9414 +               *at_symbol = 0;
9415 +       }
9416 +
9417 +       /*
9418 +        * With the toi_file allocator, it is possible to be able to resume
9419 +        * but not to hibernate: resume= may be set correctly while
9420 +        * toi_file_target isn't.
9421 +        *
9422 +        * We may have come here as a result of setting resume= or
9423 +        * toi_file_target. We only test the toi_file target in the
9424 +        * former case (it's already done in the latter), and we do it
9425 +        * before setting the block number ourselves, since the test would
9426 +        * otherwise overwrite the values given on the command line.
9427 +        */
9428 +
9429 +       if (!setting_toi_file_target)
9430 +               __test_toi_file_target(toi_file_target, 1, 0);
9431 +
9432 +       if (colon)
9433 +               target_firstblock = (int) simple_strtoul(colon + 1, NULL, 0);
9434 +       else
9435 +               target_firstblock = 0;
9436 +
9437 +       if (at_symbol) {
9438 +               target_blocksize = (int) simple_strtoul(at_symbol + 1, NULL, 0);
9439 +               if (target_blocksize & (SECTOR_SIZE - 1)) {
9440 +                       printk(KERN_INFO "FileAllocator: Blocksizes must be "
9441 +                                       "multiples of %d.\n", SECTOR_SIZE);
9442 +                       result = -EINVAL;
9443 +                       goto out;
9444 +               }
9445 +       }
9446 +
9447 +       if (!quiet)
9448 +               printk(KERN_INFO "TuxOnIce FileAllocator: Testing whether you"
9449 +                               " can resume:\n");
9450 +
9451 +       toi_file_get_target_info(commandline, 0, 1);
9452 +
9453 +       if (!toi_file_target_bdev || IS_ERR(toi_file_target_bdev)) {
9454 +               toi_file_target_bdev = NULL;
9455 +               result = -1;
9456 +               goto out;
9457 +       }
9458 +
9459 +       if (target_blocksize)
+               /* ffs() is 1-based: ffs(4096) == 13, while the blkbits
+                * value for a 4096-byte block is 12, hence the "- 1". */
9460 +               set_devinfo(toi_file_target_bdev, ffs(target_blocksize) - 1);
9461 +
9462 +       result = __test_toi_file_target(commandline, 1, 0);
9463 +
9464 +out:
9465 +       if (result)
9466 +               clear_toi_state(TOI_CAN_HIBERNATE);
9467 +
9468 +       if (!quiet)
9469 +               printk(KERN_INFO "Resuming %sabled.\n",  result ? "dis" : "en");
9470 +
9471 +       if (colon)
9472 +               *colon = ':';
9473 +       if (at_symbol)
9474 +               *at_symbol = '@';
9475 +
9476 +       return result;
9477 +}
9478 +
9479 +/* toi_file_save_config_info
9480 + *
9481 + * Description:        Save the target's name, not for resume time, but for
9482 + *             all_settings.
9483 + * Arguments:  Buffer:         Pointer to a buffer of size PAGE_SIZE.
9484 + * Returns:    Number of bytes used for saving our data.
9485 + */
9486 +
9487 +static int toi_file_save_config_info(char *buffer)
9488 +{
9489 +       strcpy(buffer, toi_file_target);
9490 +       return strlen(toi_file_target) + 1;
9491 +}
9492 +
9493 +/* toi_file_load_config_info
9494 + *
9495 + * Description:        Reload target's name.
9496 + * Arguments:  Buffer:         Pointer to the start of the data.
9497 + *             Size:           Number of bytes that were saved.
9498 + */
9499 +
9500 +static void toi_file_load_config_info(char *buffer, int size)
9501 +{
9502 +       strcpy(toi_file_target, buffer);
9503 +}
9504 +
9505 +static int toi_file_initialise(int starting_cycle)
9506 +{
9507 +       if (starting_cycle) {
9508 +               if (toiActiveAllocator != &toi_fileops)
9509 +                       return 0;
9510 +
9511 +               if ((starting_cycle & SYSFS_HIBERNATE) && !*toi_file_target) {
9512 +                       printk(KERN_INFO "FileAllocator is the active writer, "
9513 +                                       "but no filename has been set.\n");
9514 +                       return 1;
9515 +               }
9516 +       }
9517 +
9518 +       if (*toi_file_target)
9519 +               toi_file_get_target_info(toi_file_target, starting_cycle, 0);
9520 +
9521 +       if (starting_cycle && (toi_file_image_exists(1) == -1)) {
9522 +               printk("%s is does not have a valid signature for "
9523 +                               "hibernating.\n", toi_file_target);
9524 +               return 1;
9525 +       }
9526 +
9527 +       return 0;
9528 +}
9529 +
9530 +static struct toi_sysfs_data sysfs_params[] = {
9531 +
9532 +       {
9533 +        TOI_ATTR("target", SYSFS_RW),
9534 +        SYSFS_STRING(toi_file_target, 256, SYSFS_NEEDS_SM_FOR_WRITE),
9535 +        .write_side_effect             = test_toi_file_target,
9536 +       },
9537 +
9538 +       {
9539 +         TOI_ATTR("enabled", SYSFS_RW),
9540 +         SYSFS_INT(&toi_fileops.enabled, 0, 1, 0),
9541 +         .write_side_effect            = attempt_to_parse_resume_device2,
9542 +       }
9543 +};
9544 +
9545 +static struct toi_module_ops toi_fileops = {
9546 +       .type                                   = WRITER_MODULE,
9547 +       .name                                   = "file storage",
9548 +       .directory                              = "file",
9549 +       .module                                 = THIS_MODULE,
9550 +       .print_debug_info                       = toi_file_print_debug_stats,
9551 +       .save_config_info                       = toi_file_save_config_info,
9552 +       .load_config_info                       = toi_file_load_config_info,
9553 +       .storage_needed                         = toi_file_storage_needed,
9554 +       .initialise                             = toi_file_initialise,
9555 +       .cleanup                                = toi_file_cleanup,
9556 +
9557 +       .noresume_reset         = toi_file_noresume_reset,
9558 +       .storage_available      = toi_file_storage_available,
9559 +       .storage_allocated      = toi_file_storage_allocated,
9560 +       .release_storage        = toi_file_release_storage,
9561 +       .reserve_header_space   = toi_file_reserve_header_space,
9562 +       .allocate_storage       = toi_file_allocate_storage,
9563 +       .image_exists           = toi_file_image_exists,
9564 +       .mark_resume_attempted  = toi_file_mark_resume_attempted,
9565 +       .write_header_init      = toi_file_write_header_init,
9566 +       .write_header_cleanup   = toi_file_write_header_cleanup,
9567 +       .read_header_init       = toi_file_read_header_init,
9568 +       .read_header_cleanup    = toi_file_read_header_cleanup,
9569 +       .remove_image           = toi_file_remove_image,
9570 +       .parse_sig_location     = toi_file_parse_sig_location,
9571 +
9572 +       .sysfs_data             = sysfs_params,
9573 +       .num_sysfs_entries      = sizeof(sysfs_params) /
9574 +               sizeof(struct toi_sysfs_data),
9575 +};
9576 +
9577 +/* ---- Registration ---- */
9578 +static __init int toi_file_load(void)
9579 +{
9580 +       toi_fileops.rw_init = toi_bio_ops.rw_init;
9581 +       toi_fileops.rw_cleanup = toi_bio_ops.rw_cleanup;
9582 +       toi_fileops.read_page = toi_bio_ops.read_page;
9583 +       toi_fileops.write_page = toi_bio_ops.write_page;
9584 +       toi_fileops.rw_header_chunk = toi_bio_ops.rw_header_chunk;
9585 +       toi_fileops.rw_header_chunk_noreadahead =
9586 +               toi_bio_ops.rw_header_chunk_noreadahead;
9587 +       toi_fileops.io_flusher = toi_bio_ops.io_flusher;
9588 +
9589 +       return toi_register_module(&toi_fileops);
9590 +}
9591 +
9592 +#ifdef MODULE
9593 +static __exit void toi_file_unload(void)
9594 +{
9595 +       toi_unregister_module(&toi_fileops);
9596 +}
9597 +
9598 +module_init(toi_file_load);
9599 +module_exit(toi_file_unload);
9600 +MODULE_LICENSE("GPL");
9601 +MODULE_AUTHOR("Nigel Cunningham");
9602 +MODULE_DESCRIPTION("TuxOnIce FileAllocator");
9603 +#else
9604 +late_initcall(toi_file_load);
9605 +#endif
9606 diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
9607 new file mode 100644
9608 index 0000000..f362017
9609 --- /dev/null
9610 +++ b/kernel/power/tuxonice_highlevel.c
9611 @@ -0,0 +1,1329 @@
9612 +/*
9613 + * kernel/power/tuxonice_highlevel.c
9614 + */
9615 +/** \mainpage TuxOnIce.
9616 + *
9617 + * TuxOnIce provides support for saving and restoring an image of
9618 + * system memory to an arbitrary storage device, either on the local computer,
9619 + * or across some network. The support is entirely OS based, so TuxOnIce
9620 + * works without requiring BIOS, APM or ACPI support. The vast majority of the
9621 + * code is also architecture independent, so it should be very easy to port
9622 + * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
9623 + * and preemption. Initramfses and initrds are also supported.
9624 + *
9625 + * TuxOnIce uses a modular design, in which the method of storing the image is
9626 + * completely abstracted from the core code, as are transformations on the data
9627 + * such as compression and/or encryption (multiple 'modules' can be used to
9628 + * provide arbitrary combinations of functionality). The user interface is also
9629 + * modular, so that arbitrarily simple or complex interfaces can be used to
9630 + * provide anything from debugging information through to eye candy.
9631 + *
9632 + * \section Copyright
9633 + *
9634 + * TuxOnIce is released under the GPLv2.
9635 + *
9636 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
9637 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
9638 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
9639 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)<BR>
9640 + *
9641 + * \section Credits
9642 + *
9643 + * Nigel would like to thank the following people for their work:
9644 + *
9645 + * Bernard Blackham <bernard@blackham.com.au><BR>
9646 + * Web page & Wiki administration, some coding. A person without whom
9647 + * TuxOnIce would not be where it is.
9648 + *
9649 + * Michael Frank <mhf@linuxmail.org><BR>
9650 + * Extensive testing and help with improving stability. I was constantly
9651 + * amazed by the quality and quantity of Michael's help.
9652 + *
9653 + * Pavel Machek <pavel@ucw.cz><BR>
9654 + * Modifications, defectiveness pointing, being with Gabor at the very
9655 + * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
9656 + * 2.5.17. Even though Pavel and I disagree on the direction suspend to
9657 + * disk should take, I appreciate the valuable work he did in helping Gabor
9658 + * get the concept working.
9659 + *
9660 + * ..and of course the myriads of TuxOnIce users who have helped diagnose
9661 + * and fix bugs, made suggestions on how to improve the code, proofread
9662 + * documentation, and donated time and money.
9663 + *
9664 + * Thanks also to corporate sponsors:
9665 + *
9666 + * <B>Redhat.</B> Sometime employer from May 2006 (my fault, not Redhat's!).
9667 + *
9668 + * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
9669 + * allowed him to work on TuxOnIce and PM related issues on company time.
9670 + *
9671 + * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
9672 + * 2003 to Jan 2004.
9673 + *
9674 + * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
9675 + * maintenance of SMP and Highmem support.
9676 + *
9677 + * <B>OSDL.</B> Provided access to various hardware configurations, and makes
9678 + * occasional small donations to the project.
9679 + */
9680 +
9681 +#include <linux/suspend.h>
9682 +#include <linux/module.h>
9683 +#include <linux/freezer.h>
9684 +#include <linux/utsrelease.h>
9685 +#include <linux/cpu.h>
9686 +#include <linux/console.h>
9687 +#include <linux/writeback.h>
9688 +#include <asm/uaccess.h>
9689 +
9690 +#include "tuxonice_modules.h"
9691 +#include "tuxonice_sysfs.h"
9692 +#include "tuxonice_prepare_image.h"
9693 +#include "tuxonice_io.h"
9694 +#include "tuxonice_ui.h"
9695 +#include "tuxonice_power_off.h"
9696 +#include "tuxonice_storage.h"
9697 +#include "tuxonice_checksum.h"
9698 +#include "tuxonice_cluster.h"
9699 +#include "tuxonice_builtin.h"
9700 +#include "tuxonice_atomic_copy.h"
9701 +#include "tuxonice_alloc.h"
9702 +
9703 +/*! Pageset metadata. */
9704 +struct pagedir pagedir2 = {2};
9705 +
9706 +static int get_pmsem, got_pmsem;
9707 +static mm_segment_t oldfs;
9708 +static atomic_t actions_running;
9709 +static int block_dump_save;
9710 +static char pre_hibernate_command[256];
9711 +static char post_hibernate_command[256];
9712 +
9713 +char *tuxonice_signature = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
9714 +
9715 +int toi_fail_num;
9716 +
9717 +int do_toi_step(int step);
9718 +
9719 +unsigned long boot_kernel_data_buffer;
9720 +
9721 +/**
9722 + * toi_finish_anything - Cleanup after doing anything.
9723 + *
9724 + * @hibernate_or_resume: Whether finishing a cycle or an attempt at resuming.
9725 + *
9726 + * This is our basic clean-up routine, matching start_anything below. We
9727 + * call cleanup routines, drop module references and restore process fs and
9728 + * cpus allowed masks, together with the global block_dump variable's value.
9729 + */
9730 +void toi_finish_anything(int hibernate_or_resume)
9731 +{
9732 +       if (!atomic_dec_and_test(&actions_running))
9733 +               return;
9734 +
9735 +       toi_cleanup_modules(hibernate_or_resume);
9736 +       toi_put_modules();
9737 +       set_fs(oldfs);
9738 +       if (hibernate_or_resume) {
9739 +               block_dump = block_dump_save;
9740 +               set_cpus_allowed(current, CPU_MASK_ALL);
9741 +               toi_alloc_print_debug_stats();
9742 +
9743 +               if (hibernate_or_resume == SYSFS_HIBERNATE &&
9744 +                               strlen(post_hibernate_command))
9745 +                       toi_launch_userspace_program(post_hibernate_command,
9746 +                                       0, UMH_WAIT_PROC);
9747 +       }
9748 +}
9749 +
9750 +/**
9751 + * toi_start_anything - Basic initialisation for TuxOnIce.
9752 + *
9753 + * @hibernate_or_resume: Whether starting a cycle or an attempt at resuming.
9754 + *
9755 + * Our basic initialisation routine. Take references on modules, use the
9756 + * kernel segment, recheck resume= if no active allocator is set, initialise
9757 + * modules, save and reset block_dump and ensure we're running on CPU0.
9758 + */
9759 +int toi_start_anything(int hibernate_or_resume)
9760 +{
9761 +       if (atomic_add_return(1, &actions_running) != 1) {
9762 +               if (hibernate_or_resume) {
9763 +                       printk(KERN_INFO "Can't start a cycle when actions are "
9764 +                                       "already running.\n");
9765 +                       atomic_dec(&actions_running);
9766 +                       return -EBUSY;
9767 +               } else
9768 +                       return 0;
9769 +       }
9770 +
9771 +       oldfs = get_fs();
9772 +       set_fs(KERNEL_DS);
9773 +
9774 +       if (hibernate_or_resume == SYSFS_HIBERNATE &&
9775 +                       strlen(pre_hibernate_command)) {
9776 +               int result = toi_launch_userspace_program(pre_hibernate_command,
9777 +                               0, UMH_WAIT_PROC);
9778 +               if (result) {
9779 +                       printk("Pre-hibernate command '%s' returned %d. "
9780 +                                       "Aborting.\n", pre_hibernate_command,
9781 +                                       result);
9782 +                       goto out_err;
9783 +               }
9784 +       }
9785 +
9786 +       if (hibernate_or_resume == SYSFS_HIBERNATE)
9787 +               toi_print_modules();
9788 +
9789 +       if (toi_get_modules()) {
9790 +               printk("TuxOnIce: Get modules failed!\n");
9791 +               goto out_err;
9792 +       }
9793 +
9794 +       if (hibernate_or_resume) {
9795 +                               (unsigned long) resume_file_dev_t);
+               toi_file_target_bdev = NULL; /* don't leave an ERR_PTR behind */
9796 +               return;
9797 +               set_cpus_allowed(current,
9798 +                               cpumask_of_cpu(first_cpu(cpu_online_map)));
9799 +       }
9800 +
9801 +       if (toi_initialise_modules_early(hibernate_or_resume))
9802 +               goto out_err;
9803 +
9804 +       if (!toiActiveAllocator)
9805 +               toi_attempt_to_parse_resume_device(!hibernate_or_resume);
9806 +
9807 +       if (toi_initialise_modules_late(hibernate_or_resume))
9808 +               goto out_err;
9809 +
9810 +       return 0;
9811 +
9812 +out_err:
9813 +       if (hibernate_or_resume)
9814 +               block_dump_save = block_dump;
9815 +       toi_finish_anything(hibernate_or_resume);
9816 +       return -EBUSY;
9817 +}
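+
+/*
+ * The two routines above are used as a matched pair, e.g. (illustrative
+ * only):
+ *
+ *     if (toi_start_anything(SYSFS_HIBERNATE))
+ *             return;
+ *     ...do the cycle...
+ *     toi_finish_anything(SYSFS_HIBERNATE);
+ *
+ * The reference count in actions_running ensures only the outermost
+ * caller performs initialisation and cleanup.
+ */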
9818 +
9819 +/*
9820 + * Nosave page tracking.
9821 + *
9822 + * Here rather than in prepare_image because we want to do it once only at the
9823 + * start of a cycle.
9824 + */
9825 +
9826 +/**
9827 + * mark_nosave_pages - Set up our Nosave bitmap.
9828 + *
9829 + * Build a bitmap of Nosave pages from the list. The bitmap allows faster
9830 + * use when preparing the image.
9831 + */
9832 +static void mark_nosave_pages(void)
9833 +{
9834 +       struct nosave_region *region;
9835 +
9836 +       list_for_each_entry(region, &nosave_regions, list) {
9837 +               unsigned long pfn;
9838 +
9839 +               for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
9840 +                       SetPageNosave(pfn_to_page(pfn));
9841 +       }
9842 +}
9843 +
9844 +/**
9845 + * allocate_bitmaps: Allocate bitmaps used to record page states.
9846 + *
9847 + * Allocate the bitmaps we use to record the various TuxOnIce related
9848 + * page states.
9849 + */
9850 +static int allocate_bitmaps(void)
9851 +{
9852 +       if (allocate_dyn_pageflags(&pageset1_map, 0) ||
9853 +           allocate_dyn_pageflags(&pageset1_copy_map, 0) ||
9854 +           allocate_dyn_pageflags(&pageset2_map, 0) ||
9855 +           allocate_dyn_pageflags(&io_map, 0) ||
9856 +           allocate_dyn_pageflags(&nosave_map, 0) ||
9857 +           allocate_dyn_pageflags(&free_map, 0) ||
9858 +           allocate_dyn_pageflags(&page_resave_map, 0))
9859 +               return 1;
9860 +
9861 +       return 0;
9862 +}
9863 +
9864 +/**
9865 + * free_bitmaps: Free the bitmaps used to record page states.
9866 + *
9867 + * Free the bitmaps allocated above. It is not an error to call
9868 + * free_dyn_pageflags on a bitmap that isn't currently allocated.
9869 + */
9870 +static void free_bitmaps(void)
9871 +{
9872 +       free_dyn_pageflags(&pageset1_map);
9873 +       free_dyn_pageflags(&pageset1_copy_map);
9874 +       free_dyn_pageflags(&pageset2_map);
9875 +       free_dyn_pageflags(&io_map);
9876 +       free_dyn_pageflags(&nosave_map);
9877 +       free_dyn_pageflags(&free_map);
9878 +       free_dyn_pageflags(&page_resave_map);
9879 +}
9880 +
9881 +/**
9882 + * io_MB_per_second: Return the number of MB/s read or written.
9883 + *
9884 + * @write: Whether to return the speed at which we wrote.
9885 + *
9886 + * Calculate the number of megabytes per second that were read or written.
9887 + */
9888 +static int io_MB_per_second(int write)
9889 +{
9890 +       return (toi_bkd.toi_io_time[write][1]) ?
9891 +               MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
9892 +               toi_bkd.toi_io_time[write][1] : 0;
9893 +}
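+
+/*
+ * Worked example (assuming MB() converts a page count to whole
+ * megabytes): 51200 four-kilobyte pages are 200 MB; written in 500
+ * jiffies with HZ == 100, that gives 200 * 100 / 500 == 40 MB/s.
+ */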
9894 +
9895 +/**
9896 + * get_debug_info: Fill a buffer with debugging information.
9897 + *
9898 + * @buffer: The buffer to be filled.
9899 + * @count: The size of the buffer, in bytes.
9900 + *
9901 + * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
9902 + * either printk or return via sysfs.
9903 + */
9904 +#define SNPRINTF(a...)         len += snprintf_used(((char *)buffer) + len, \
9905 +               count - len - 1, ## a)
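+
+/*
+ * Each SNPRINTF() invocation appends at offset 'len' and advances it,
+ * so successive calls build one string: SNPRINTF("a"); SNPRINTF("b");
+ * leaves "ab" in the buffer (snprintf_used is assumed to return the
+ * number of bytes actually used).
+ */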
9906 +static int get_toi_debug_info(const char *buffer, int count)
9907 +{
9908 +       int len = 0;
9909 +
9910 +       SNPRINTF("TuxOnIce debugging info:\n");
9911 +       SNPRINTF("- TuxOnIce core  : " TOI_CORE_VERSION "\n");
9912 +       SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
9913 +       SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
9914 +       SNPRINTF("- Attempt number : %d\n", nr_hibernates);
9915 +       SNPRINTF("- Parameters     : %ld %ld %ld %d %d %ld\n",
9916 +                       toi_result,
9917 +                       toi_bkd.toi_action,
9918 +                       toi_bkd.toi_debug_state,
9919 +                       toi_bkd.toi_default_console_level,
9920 +                       image_size_limit,
9921 +                       toi_poweroff_method);
9922 +       SNPRINTF("- Overall expected compression percentage: %d.\n",
9923 +                       100 - toi_expected_compression_ratio());
9924 +       len += toi_print_module_debug_info(((char *) buffer) + len,
9925 +                       count - len - 1);
9926 +       if (toi_bkd.toi_io_time[0][1]) {
9927 +               if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
9928 +                       SNPRINTF("- I/O speed: Write %d KB/s",
9929 +                         (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
9930 +                         toi_bkd.toi_io_time[0][1]));
9931 +                       if (toi_bkd.toi_io_time[1][1])
9932 +                               SNPRINTF(", Read %d KB/s",
9933 +                                 (KB((unsigned long)
9934 +                                     toi_bkd.toi_io_time[1][0]) * HZ /
9935 +                                 toi_bkd.toi_io_time[1][1]));
9936 +               } else {
9937 +                       SNPRINTF("- I/O speed: Write %d MB/s",
9938 +                        (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
9939 +                         toi_bkd.toi_io_time[0][1]));
9940 +                       if (toi_bkd.toi_io_time[1][1])
9941 +                               SNPRINTF(", Read %d MB/s",
9942 +                                (MB((unsigned long)
9943 +                                    toi_bkd.toi_io_time[1][0]) * HZ /
9944 +                                 toi_bkd.toi_io_time[1][1]));
9945 +               }
9946 +               SNPRINTF(".\n");
9947 +       } else
9948 +               SNPRINTF("- No I/O speed stats available.\n");
9949 +       SNPRINTF("- Extra pages    : %ld used/%ld.\n",
9950 +                       extra_pd1_pages_used, extra_pd1_pages_allowance);
9951 +
9952 +       return len;
9953 +}
9954 +
9955 +/**
9956 + * do_cleanup: Cleanup after attempting to hibernate or resume.
9957 + *
9958 + * @get_debug_info: Whether to allocate and return debugging info.
9959 + *
9960 + * Cleanup after attempting to hibernate or resume, possibly getting
9961 + * debugging info as we do so.
9962 + */
9963 +static void do_cleanup(int get_debug_info)
9964 +{
9965 +       int i = 0;
9966 +       char *buffer = NULL;
9967 +
9968 +       if (get_debug_info)
9969 +               toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
9970 +       relink_lru_lists();
9971 +
9972 +       free_checksum_pages();
9973 +
9974 +       if (get_debug_info)
9975 +               buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
9976 +
9977 +       if (buffer)
9978 +               i = get_toi_debug_info(buffer, PAGE_SIZE);
9979 +
9980 +       toi_free_extra_pagedir_memory();
9981 +
9982 +       pagedir1.size = pagedir2.size = 0;
9983 +       set_highmem_size(pagedir1, 0);
9984 +       set_highmem_size(pagedir2, 0);
9985 +
9986 +       if (boot_kernel_data_buffer) {
9987 +               toi_free_page(37, boot_kernel_data_buffer);
9988 +               boot_kernel_data_buffer = 0;
9989 +       }
9990 +
9991 +       if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
9992 +               pm_notifier_call_chain(PM_POST_HIBERNATION);
9993 +               clear_toi_state(TOI_NOTIFIERS_PREPARE);
9994 +       }
9995 +
9996 +       thaw_processes();
9997 +
9998 +#ifdef CONFIG_TOI_KEEP_IMAGE
9999 +       if (test_action_state(TOI_KEEP_IMAGE) &&
10000 +           !test_result_state(TOI_ABORTED)) {
10001 +               toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
10002 +                       "TuxOnIce: Not invalidating the image due "
10003 +                       "to Keep Image being enabled.\n");
10004 +               set_result_state(TOI_KEPT_IMAGE);
10005 +       } else
10006 +#endif
10007 +               if (toiActiveAllocator)
10008 +                       toiActiveAllocator->remove_image();
10009 +
10010 +       free_bitmaps();
10011 +
10012 +       if (buffer && i) {
10013 +               /* Printk can only handle 1023 bytes, including
10014 +                * its level mangling. */
10015 +               for (i = 0; i < 3; i++)
10016 +                       printk("%s", buffer + (1023 * i));
10017 +               toi_free_page(20, (unsigned long) buffer);
10018 +       }
10019 +
10020 +       if (!test_action_state(TOI_LATE_CPU_HOTPLUG))
10021 +               enable_nonboot_cpus();
10022 +       toi_cleanup_console();
10023 +
10024 +       free_attention_list();
10025 +
10026 +       toi_deactivate_storage(0);
10027 +
10028 +       clear_toi_state(TOI_IGNORE_LOGLEVEL);
10029 +       clear_toi_state(TOI_TRYING_TO_RESUME);
10030 +       clear_toi_state(TOI_NOW_RESUMING);
10031 +
10032 +       if (got_pmsem) {
10033 +               mutex_unlock(&pm_mutex);
10034 +               got_pmsem = 0;
10035 +       }
10036 +}
10037 +
10038 +/**
10039 + * check_still_keeping_image: We kept an image; check whether to reuse it.
10040 + *
10041 + * We enter this routine when we have kept an image. If the user has said they
10042 + * want to still keep it, all we need to do is powerdown. If powering down
10043 + * means hibernating to ram and the power doesn't run out, we'll return 1.
10044 + * If we do power off properly or the battery runs out, we'll resume via the
10045 + * normal paths.
10046 + *
10047 + * If the user has said they want to remove the previously kept image, we
10048 + * remove it, and return 0. We'll then store a new image.
10049 + */
10050 +static int check_still_keeping_image(void)
10051 +{
10052 +       if (test_action_state(TOI_KEEP_IMAGE)) {
10053 +               printk("Image already stored: powering down immediately.\n");
10054 +               do_toi_step(STEP_HIBERNATE_POWERDOWN);
10055 +               return 1;       /* Just in case we're using S3 */
10056 +       }
10057 +
10058 +       printk("Invalidating previous image.\n");
10059 +       toiActiveAllocator->remove_image();
10060 +
10061 +       return 0;
10062 +}
10063 +
10064 +/**
10065 + * toi_init: Prepare to hibernate to disk.
10066 + *
10067 + * Initialise variables & data structures, in preparation for
10068 + * hibernating to disk.
10069 + */
10070 +static int toi_init(void)
10071 +{
10072 +       int result;
10073 +
10074 +       toi_result = 0;
10075 +
10076 +       printk(KERN_INFO "Initiating a hibernation cycle.\n");
10077 +
10078 +       nr_hibernates++;
10079 +
10080 +       toi_bkd.toi_io_time[0][0] = toi_bkd.toi_io_time[0][1] =
10081 +               toi_bkd.toi_io_time[1][0] = toi_bkd.toi_io_time[1][1] = 0;
10082 +
10083 +       if (!test_toi_state(TOI_CAN_HIBERNATE) ||
10084 +           allocate_bitmaps())
10085 +               return 1;
10086 +
10087 +       mark_nosave_pages();
10088 +
10089 +       toi_prepare_console();
10090 +
10091 +       result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
10092 +       if (result) {
10093 +               set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
10094 +               return 1;
10095 +       }
10096 +       set_toi_state(TOI_NOTIFIERS_PREPARE);
10097 +
10098 +       boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
10099 +       if (!boot_kernel_data_buffer) {
10100 +               printk(KERN_ERR "TuxOnIce: Failed to allocate "
10101 +                               "boot_kernel_data_buffer.\n");
10102 +               set_result_state(TOI_OUT_OF_MEMORY);
10103 +               return 1;
10104 +       }
10105 +
10106 +       if (test_action_state(TOI_LATE_CPU_HOTPLUG) ||
10107 +                       !disable_nonboot_cpus())
10108 +               return 1;
10109 +
10110 +       set_abort_result(TOI_CPU_HOTPLUG_FAILED);
10111 +       return 0;
10112 +}
10113 +
10114 +/**
10115 + * can_hibernate: Perform basic 'Can we hibernate?' tests.
10116 + *
10117 + * Perform basic tests that must pass if we're going to be able to hibernate:
10118 + * Can we get the pm_mutex? Is resume= valid (we need to know where to write
10119 + * the image header).
10120 + */
10121 +static int can_hibernate(void)
10122 +{
10123 +       if (get_pmsem) {
10124 +               if (!mutex_trylock(&pm_mutex)) {
10125 +                       printk(KERN_INFO "TuxOnIce: Failed to obtain "
10126 +                                       "pm_mutex.\n");
10127 +                       dump_stack();
10128 +                       set_abort_result(TOI_PM_SEM);
10129 +                       return 0;
10130 +               }
10131 +               got_pmsem = 1;
10132 +       }
10133 +
10134 +       if (!test_toi_state(TOI_CAN_HIBERNATE))
10135 +               toi_attempt_to_parse_resume_device(0);
10136 +
10137 +       if (!test_toi_state(TOI_CAN_HIBERNATE)) {
10138 +               printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
10139 +                       "This may be because you haven't put something along "
10140 +                       "the lines of\n\nresume=swap:/dev/hda1\n\n"
10141 +                       "in lilo.conf or equivalent. (Where /dev/hda1 is your "
10142 +                       "swap partition).\n");
10143 +               set_abort_result(TOI_CANT_SUSPEND);
10144 +               if (!got_pmsem) {
10145 +                       mutex_unlock(&pm_mutex);
10146 +                       got_pmsem = 0;
10147 +               }
10148 +               return 0;
10149 +       }
10150 +
10151 +       return 1;
10152 +}
10153 +
10154 +/**
10155 + * do_post_image_write: Having written an image, figure out what to do next.
10156 + *
10157 + * After writing an image, we might load an alternate image or power down.
10158 + * Powering down might involve hibernating to ram, in which case we also
10159 + * need to handle reloading pageset2.
10160 + */
10161 +static int do_post_image_write(void)
10162 +{
10163 +       /* If switching images fails, do normal powerdown */
10164 +       if (alt_resume_param[0])
10165 +               do_toi_step(STEP_RESUME_ALT_IMAGE);
10166 +
10167 +       toi_cond_pause(1, "About to power down or reboot.");
10168 +       toi_power_down();
10169 +
10170 +       /* If we return, it's because we hibernated to ram */
10171 +       if (read_pageset2(1))
10172 +               panic("Attempt to reload pagedir 2 failed. Try rebooting.");
10173 +
10174 +       barrier();
10175 +       mb();
10176 +       do_cleanup(1);
10177 +       return 0;
10178 +}
10179 +
10180 +/**
10181 + * __save_image: Do the hard work of saving the image.
10182 + *
10183 + * High level routine for getting the image saved. The key assumptions made
10184 + * are that processes have been frozen and sufficient memory is available.
10185 + *
10186 + * We also exit through here at resume time, coming back from toi_hibernate
10187 + * after the atomic restore. This is the reason for the toi_in_hibernate
10188 + * test.
10189 + */
10190 +static int __save_image(void)
10191 +{
10192 +       int temp_result, did_copy = 0;
10193 +
10194 +       toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image...");
10195 +
10196 +       toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
10197 +               " - Final values: %d and %d.\n",
10198 +               pagedir1.size, pagedir2.size);
10199 +
10200 +       toi_cond_pause(1, "About to write pagedir2.");
10201 +
10202 +       temp_result = write_pageset(&pagedir2);
10203 +
10204 +       if (temp_result == -1 || test_result_state(TOI_ABORTED))
10205 +               return 1;
10206 +
10207 +       toi_cond_pause(1, "About to copy pageset 1.");
10208 +
10209 +       if (test_result_state(TOI_ABORTED))
10210 +               return 1;
10211 +
10212 +       toi_deactivate_storage(1);
10213 +
10214 +       toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy.");
10215 +
10216 +       toi_in_hibernate = 1;
10217 +
10218 +       if (toi_go_atomic(PMSG_FREEZE, 1))
10219 +               goto Failed;
10220 +
10221 +       temp_result = toi_hibernate();
10222 +       if (!temp_result)
10223 +               did_copy = 1;
10224 +
10225 +       /* We return here at resume time too! */
10226 +       toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate);
10227 +
10228 +Failed:
10229 +       if (toi_activate_storage(1))
10230 +               panic("Failed to reactivate our storage.");
10231 +
10232 +       /* Resume time? */
10233 +       if (!toi_in_hibernate) {
10234 +               copyback_post();
10235 +               return 0;
10236 +       }
10237 +
10238 +       /* Nope. Hibernating. So, see if we can save the image... */
10239 +
10240 +       if (temp_result || test_result_state(TOI_ABORTED)) {
10241 +               if (did_copy)
10242 +                       goto abort_reloading_pagedir_two;
10243 +               else
10244 +                       return 1;
10245 +       }
10246 +
10247 +       toi_update_status(pagedir2.size,
10248 +                       pagedir1.size + pagedir2.size,
10249 +                       NULL);
10250 +
10251 +       if (test_result_state(TOI_ABORTED))
10252 +               goto abort_reloading_pagedir_two;
10253 +
10254 +       toi_cond_pause(1, "About to write pageset1.");
10255 +
10256 +       toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
10257 +                       "-- Writing pageset1\n");
10258 +
10259 +       temp_result = write_pageset(&pagedir1);
10260 +
10261 +       /* We didn't overwrite any memory, so no reread needs to be done. */
10262 +       if (test_action_state(TOI_TEST_FILTER_SPEED))
10263 +               return 1;
10264 +
10265 +       if (temp_result == 1 || test_result_state(TOI_ABORTED))
10266 +               goto abort_reloading_pagedir_two;
10267 +
10268 +       toi_cond_pause(1, "About to write header.");
10269 +
10270 +       if (test_result_state(TOI_ABORTED))
10271 +               goto abort_reloading_pagedir_two;
10272 +
10273 +       temp_result = write_image_header();
10274 +
10275 +       if (test_action_state(TOI_TEST_BIO))
10276 +               return 1;
10277 +
10278 +       if (!temp_result && !test_result_state(TOI_ABORTED))
10279 +               return 0;
10280 +
10281 +abort_reloading_pagedir_two:
10282 +       temp_result = read_pageset2(1);
10283 +
10284 +       /* If that failed, we're sunk. Panic! */
10285 +       if (temp_result)
10286 +               panic("Attempt to reload pagedir 2 while aborting "
10287 +                               "a hibernate failed.");
10288 +
10289 +       return 1;
10290 +}
10291 +
10292 +/**
10293 + * do_save_image: Save the image and handle the result.
10294 + *
10295 + * Save the prepared image. If we fail or we're in the path returning
10296 + * from the atomic restore, clean up.
10297 + */
10298 +
10299 +static int do_save_image(void)
10300 +{
10301 +       int result = __save_image();
10302 +       if (!toi_in_hibernate || result)
10303 +               do_cleanup(1);
10304 +       return result;
10305 +}
10306 +
10307 +
10308 +/**
10309 + * do_prepare_image: Try to prepare an image.
10310 + *
10311 + * Try to initialise and prepare an image to be saved. On failure,
10312 + * clean up.
10313 + */
10314 +
10315 +static int do_prepare_image(void)
10316 +{
10317 +       if (toi_activate_storage(0))
10318 +               return 1;
10319 +
10320 +       /*
10321 +        * If we kept an image, are still keeping it and are hibernating to RAM,
10322 +        * we will return 1 after hibernating and resuming (provided the power
10323 +        * doesn't run out), skipping directly to cleaning up and exiting.
10324 +        */
10325 +
10326 +       if (!can_hibernate() ||
10327 +           (test_result_state(TOI_KEPT_IMAGE) &&
10328 +            check_still_keeping_image()))
10329 +               goto cleanup;
10330 +
10331 +       if (toi_init() && !toi_prepare_image() &&
10332 +                       !test_result_state(TOI_ABORTED))
10333 +               return 0;
10334 +
10335 +cleanup:
10336 +       do_cleanup(0);
10337 +       return 1;
10338 +}
10339 +
10340 +/**
10341 + * do_check_can_resume: Find out whether an image has been stored.
10342 + *
10343 + * Read whether an image exists. We use the same routine as the
10344 + * image_exists sysfs entry, and just look to see whether the
10345 + * first character in the resulting buffer is a '1'.
10346 + */
10347 +int do_check_can_resume(void)
10348 +{
10349 +       char *buf = (char *) toi_get_zeroed_page(21, TOI_ATOMIC_GFP);
10350 +       int result = 0;
10351 +
10352 +       if (!buf)
10353 +               return 0;
10354 +
10355 +       /* Only interested in first byte, so throw away return code. */
10356 +       image_exists_read(buf, PAGE_SIZE);
10357 +
10358 +       if (buf[0] == '1')
10359 +               result = 1;
10360 +
10361 +       toi_free_page(21, (unsigned long) buf);
10362 +       return result;
10363 +}
10364 +
10365 +/**
10366 + * do_load_atomic_copy: Load the first part of an image, if it exists.
10367 + *
10368 + * Check whether we have an image. If one exists, do sanity checking
10369 + * (possibly invalidating the image or even rebooting if the user
10370 + * requests that) before loading it into memory in preparation for the
10371 + * atomic restore.
10372 + *
10373 + * If and only if we have an image loaded and ready to restore, we return 1.
10374 + */
10375 +static int do_load_atomic_copy(void)
10376 +{
10377 +       int read_image_result = 0;
10378 +
10379 +       if (sizeof(swp_entry_t) != sizeof(long)) {
10380 +               printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
10381 +                       " of long. Please report this!\n");
10382 +               return 1;
10383 +       }
10384 +
10385 +       if (!resume_file[0])
10386 +               printk(KERN_WARNING "TuxOnIce: "
10387 +                       "You need to use a resume= command line parameter to "
10388 +                       "tell TuxOnIce where to look for an image.\n");
10389 +
10390 +       toi_activate_storage(0);
10391 +
10392 +       if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
10393 +               !toi_attempt_to_parse_resume_device(0)) {
10394 +               /*
10395 +                * Without a usable storage device we can do nothing -
10396 +                * even if noresume is given
10397 +                */
10398 +
10399 +               if (!toiNumAllocators)
10400 +                       printk(KERN_ALERT "TuxOnIce: "
10401 +                         "No storage allocators have been registered.\n");
10402 +               else
10403 +                       printk(KERN_ALERT "TuxOnIce: "
10404 +                               "Missing or invalid storage location "
10405 +                               "(resume= parameter). Please correct and "
10406 +                               "rerun lilo (or equivalent) before "
10407 +                               "hibernating.\n");
10408 +               toi_deactivate_storage(0);
10409 +               return 1;
10410 +       }
10411 +
10412 +       read_image_result = read_pageset1(); /* non fatal error ignored */
10413 +
10414 +       if (test_toi_state(TOI_NORESUME_SPECIFIED))
10415 +               clear_toi_state(TOI_NORESUME_SPECIFIED);
10416 +
10417 +       toi_deactivate_storage(0);
10418 +
10419 +       if (read_image_result)
10420 +               return 1;
10421 +
10422 +       return 0;
10423 +}
10424 +
10425 +/**
10426 + * prepare_restore_load_alt_image: Save & restore alt image variables.
10427 + *
10428 + * Save and restore the pageset1 maps, when loading an alternate image.
10429 + */
10430 +static void prepare_restore_load_alt_image(int prepare)
10431 +{
10432 +       static struct dyn_pageflags pageset1_map_save, pageset1_copy_map_save;
10433 +
10434 +       if (prepare) {
10435 +               memcpy(&pageset1_map_save, &pageset1_map,
10436 +                               sizeof(struct dyn_pageflags));
10437 +               pageset1_map.bitmap = NULL;
10438 +               pageset1_map.sparse = 0;
10439 +               pageset1_map.initialised = 0;
10440 +               memcpy(&pageset1_copy_map_save, &pageset1_copy_map,
10441 +                       sizeof(struct dyn_pageflags));
10442 +               pageset1_copy_map.bitmap = NULL;
10443 +               pageset1_copy_map.sparse = 0;
10444 +               pageset1_copy_map.initialised = 0;
10445 +               set_toi_state(TOI_LOADING_ALT_IMAGE);
10446 +               toi_reset_alt_image_pageset2_pfn();
10447 +       } else {
10448 +               if (pageset1_map.bitmap)
10449 +                       free_dyn_pageflags(&pageset1_map);
10450 +               memcpy(&pageset1_map, &pageset1_map_save,
10451 +                       sizeof(struct dyn_pageflags));
10452 +               if (pageset1_copy_map.bitmap)
10453 +                       free_dyn_pageflags(&pageset1_copy_map);
10454 +               memcpy(&pageset1_copy_map, &pageset1_copy_map_save,
10455 +                       sizeof(struct dyn_pageflags));
10456 +               clear_toi_state(TOI_NOW_RESUMING);
10457 +               clear_toi_state(TOI_LOADING_ALT_IMAGE);
10458 +       }
10459 +}
10460 +
10461 +/**
10462 + * pre_resume_freeze: Freeze the system, before doing an atomic restore.
10463 + *
10464 + * Hot unplug cpus (if we didn't do it early) and freeze processes, in
10465 + * preparation for doing an atomic restore.
10466 + */
10467 +int pre_resume_freeze(void)
10468 +{
10469 +       if (!test_action_state(TOI_LATE_CPU_HOTPLUG)) {
10470 +               toi_prepare_status(DONT_CLEAR_BAR, "Disable nonboot cpus.");
10471 +               if (disable_nonboot_cpus()) {
10472 +                       set_abort_result(TOI_CPU_HOTPLUG_FAILED);
10473 +                       return 1;
10474 +               }
10475 +       }
10476 +
10477 +       toi_prepare_status(DONT_CLEAR_BAR, "Freeze processes.");
10478 +
10479 +       if (freeze_processes()) {
10480 +               printk("Some processes failed to stop.\n");
10481 +               return 1;
10482 +       }
10483 +
10484 +       return 0;
10485 +}
10486 +
10487 +/**
10488 + * do_toi_step: Perform a step in hibernating or resuming.
10489 + *
10490 + * Perform a step in hibernating or resuming an image. This abstraction
10491 + * is in preparation for implementing cluster support, and perhaps replacing
10492 + * uswsusp too (haven't looked whether that's possible yet).
10493 + */
10494 +int do_toi_step(int step)
10495 +{
10496 +       switch (step) {
10497 +       case STEP_HIBERNATE_PREPARE_IMAGE:
10498 +               return do_prepare_image();
10499 +       case STEP_HIBERNATE_SAVE_IMAGE:
10500 +               return do_save_image();
10501 +       case STEP_HIBERNATE_POWERDOWN:
10502 +               return do_post_image_write();
10503 +       case STEP_RESUME_CAN_RESUME:
10504 +               return do_check_can_resume();
10505 +       case STEP_RESUME_LOAD_PS1:
10506 +               return do_load_atomic_copy();
10507 +       case STEP_RESUME_DO_RESTORE:
10508 +               /*
10509 +                * If we succeed, this doesn't return.
10510 +                * Instead, we return from do_save_image() in the
10511 +                * hibernated kernel.
10512 +                */
10513 +               return toi_atomic_restore();
10514 +       case STEP_RESUME_ALT_IMAGE:
10515 +               printk(KERN_INFO "Trying to resume alternate image.\n");
10516 +               toi_in_hibernate = 0;
10517 +               save_restore_alt_param(SAVE, NOQUIET);
10518 +               prepare_restore_load_alt_image(1);
10519 +               if (!do_check_can_resume()) {
10520 +                       printk(KERN_INFO "Nothing to resume from.\n");
10521 +                       goto out;
10522 +               }
10523 +               if (!do_load_atomic_copy())
10524 +                       toi_atomic_restore();
10525 +
10526 +               printk(KERN_INFO "Failed to load image.\n");
10527 +out:
10528 +               prepare_restore_load_alt_image(0);
10529 +               save_restore_alt_param(RESTORE, NOQUIET);
10530 +               break;
10531 +       case STEP_CLEANUP:
10532 +               do_cleanup(1);
10533 +               break;
10534 +       case STEP_QUIET_CLEANUP:
10535 +               do_cleanup(0);
10536 +               break;
10537 +       }
10538 +
10539 +       return 0;
10540 +}
10541 +EXPORT_SYMBOL_GPL(do_toi_step);
10542 +
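Because do_toi_step() is exported, a caller outside this file (the cluster
support the comment above anticipates, for instance) could drive a cycle step
by step instead of going through _toi_try_hibernate(). A minimal sketch,
assuming only the STEP_* constants and the convention that each step returns
non-zero on failure; the function name is invented for illustration:

        /* Editor's sketch, not part of the patch. */
        static int example_hibernate_via_steps(void)
        {
                if (do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE))
                        return 1; /* Preparation failed; already cleaned up. */

                if (do_toi_step(STEP_HIBERNATE_SAVE_IMAGE))
                        return 1;

                /*
                 * Like _toi_try_hibernate(), we also pass through here in
                 * the resumed kernel; only power down when hibernating.
                 */
                if (toi_in_hibernate)
                        return do_toi_step(STEP_HIBERNATE_POWERDOWN);

                return 0;
        }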
10543 +/* -- Functions for kickstarting a hibernate or resume --- */
10544 +
10545 +/**
10546 + * __toi_try_resume: Try to do the steps in resuming.
10547 + *
10548 + * Check if we have an image and if so try to resume. Clear the status
10549 + * flags too.
10550 + */
10551 +void __toi_try_resume(void)
10552 +{
10553 +       set_toi_state(TOI_TRYING_TO_RESUME);
10554 +       resume_attempted = 1;
10555 +
10556 +       current->flags |= PF_MEMALLOC;
10557 +
10558 +       if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
10559 +           !do_toi_step(STEP_RESUME_LOAD_PS1))
10560 +           do_toi_step(STEP_RESUME_DO_RESTORE);
10561 +
10562 +       do_cleanup(0);
10563 +
10564 +       current->flags &= ~PF_MEMALLOC;
10565 +
10566 +       clear_toi_state(TOI_IGNORE_LOGLEVEL);
10567 +       clear_toi_state(TOI_TRYING_TO_RESUME);
10568 +       clear_toi_state(TOI_NOW_RESUMING);
10569 +}
10570 +
10571 +/**
10572 + * _toi_try_resume: Wrapper calling __toi_try_resume from do_mounts.
10573 + *
10574 + * Wrapper for when __toi_try_resume is called from init/do_mounts.c,
10575 + * rather than from echo > /sys/power/tuxonice/do_resume.
10576 + */
10577 +void _toi_try_resume(void)
10578 +{
10579 +       resume_attempted = 1;
10580 +
10581 +       /*
10582 +        * There's a comment in kernel/power/disk.c that indicates
10583 +        * we should be able to use mutex_lock_nested below. That
10584 +        * doesn't seem to cut it, though, so let's just turn lockdep
10585 +        * off for now.
10586 +        */
10587 +       lockdep_off();
10588 +
10589 +       if (toi_start_anything(SYSFS_RESUMING))
10590 +               goto out;
10591 +
10592 +       /* Unlock will be done in do_cleanup */
10593 +       mutex_lock(&pm_mutex);
10594 +       got_pmsem = 1;
10595 +
10596 +       __toi_try_resume();
10597 +
10598 +       /*
10599 +        * For initramfs, we have to clear the boot time
10600 +        * flag after trying to resume
10601 +        */
10602 +       clear_toi_state(TOI_BOOT_TIME);
10603 +
10604 +out:
10605 +       toi_finish_anything(SYSFS_RESUMING);
10606 +       lockdep_on();
10607 +
10608 +}
10609 +
10610 +/**
10611 + * _toi_try_hibernate: Try to start a hibernation cycle.
10612 + *
10613 + * have_pmsem: Whether the pm_sem is already taken.
10614 + *
10615 + * Start a hibernation cycle, coming in from either
10616 + * echo > /sys/power/tuxonice/do_suspend
10617 + *
10618 + * or
10619 + *
10620 + * echo disk > /sys/power/state
10621 + *
10622 + * In the latter case, we come in without pm_sem taken; in the
10623 + * former, it has been taken.
10624 + */
10625 +int _toi_try_hibernate(int have_pmsem)
10626 +{
10627 +       int result = 0, sys_power_disk = 0;
10628 +
10629 +       if (!atomic_read(&actions_running)) {
10630 +               /* Came in via /sys/power/disk */
10631 +               if (toi_start_anything(SYSFS_HIBERNATING))
10632 +                       return -EBUSY;
10633 +               sys_power_disk = 1;
10634 +       }
10635 +
10636 +       get_pmsem = !have_pmsem;
10637 +
10638 +       if (strlen(alt_resume_param)) {
10639 +               attempt_to_parse_alt_resume_param();
10640 +
10641 +               if (!strlen(alt_resume_param)) {
10642 +                       printk(KERN_INFO "Alternate resume parameter now "
10643 +                                       "invalid. Aborting.\n");
10644 +                       goto out;
10645 +               }
10646 +       }
10647 +
10648 +       current->flags |= PF_MEMALLOC;
10649 +
10650 +       if (test_toi_state(TOI_CLUSTER_MODE)) {
10651 +               toi_initiate_cluster_hibernate();
10652 +               goto out;
10653 +       }
10654 +
10655 +       result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
10656 +       if (result)
10657 +               goto out;
10658 +
10659 +       if (test_action_state(TOI_FREEZER_TEST)) {
10660 +               do_cleanup(0);
10661 +               goto out;
10662 +       }
10663 +
10664 +       result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
10665 +       if (result)
10666 +               goto out;
10667 +
10668 +       /* This code runs at resume time too! */
10669 +       if (toi_in_hibernate)
10670 +               result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
10671 +out:
10672 +       current->flags &= ~PF_MEMALLOC;
10673 +
10674 +       if (sys_power_disk)
10675 +               toi_finish_anything(SYSFS_HIBERNATING);
10676 +
10677 +       return result;
10678 +}
10679 +
10680 +/*
10681 + * channel_no: If !0, -c <channel_no> is added to args (userui).
10682 + */
10683 +int toi_launch_userspace_program(char *command, int channel_no,
10684 +               enum umh_wait wait)
10685 +{
10686 +       int retval;
10687 +       static char *envp[] = {
10688 +                       "HOME=/",
10689 +                       "TERM=linux",
10690 +                       "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
10691 +                       NULL };
10692 +       static char *argv[] =
10693 +               { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
10694 +       char *channel = NULL;
10695 +       int arg = 0, size;
10696 +       char test_read[255];
10697 +       char *orig_posn = command;
10698 +
10699 +       if (!strlen(orig_posn))
10700 +               return 1;
10701 +
10702 +       if (channel_no) {
10703 +               channel = toi_kzalloc(4, 6, GFP_KERNEL);
10704 +               if (!channel) {
10705 +                       printk(KERN_INFO "Failed to allocate memory in "
10706 +                               "preparing to launch userspace program.\n");
10707 +                       return 1;
10708 +               }
10709 +       }
10710 +
10711 +       /* Up to 7 args supported */
10712 +       while (arg < 7) {
10713 +               sscanf(orig_posn, "%s", test_read);
10714 +               size = strlen(test_read);
10715 +               if (!(size))
10716 +                       break;
10717 +               argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
10718 +               strcpy(argv[arg], test_read);
10719 +               orig_posn += size + 1;
10720 +               *test_read = 0;
10721 +               arg++;
10722 +       }
10723 +
10724 +       if (channel_no) {
10725 +               sprintf(channel, "-c%d", channel_no);
10726 +               argv[arg] = channel;
10727 +       } else
10728 +               arg--;
10729 +
10730 +       retval = call_usermodehelper(argv[0], argv, envp, wait);
10731 +
10732 +       /*
10733 +        * If the program reports an error, retval = 256. Don't complain
10734 +        * about that here.
10735 +        */
10736 +       if (retval && retval != 256)
10737 +               printk("Failed to launch userspace program '%s': Error %d\n",
10738 +                               command, retval);
10739 +
10740 +       {
10741 +               int i;
10742 +               for (i = 0; i < arg; i++)
10743 +                       if (argv[i] && argv[i] != channel)
10744 +                               toi_kfree(5, argv[i]);
10745 +       }
10746 +
10747 +       toi_kfree(4, channel);
10748 +
10749 +       return retval;
10750 +}
10751 +
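For illustration, a call such as the user interface code might make: launch a
helper on channel 1 and block until it exits. The helper path is a placeholder,
UMH_WAIT_PROC is the standard enum umh_wait value for waiting on the process,
and, per the comment above, a return of 256 only means the program itself
reported an error:

        /* Editor's sketch: the binary path is hypothetical. */
        int ret = toi_launch_userspace_program("/sbin/tuxoniceui_text", 1,
                        UMH_WAIT_PROC);
        if (ret && ret != 256)
                printk(KERN_WARNING "Userspace helper unavailable; "
                                "continuing without it.\n");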
10752 +/*
10753 + * This array contains entries that are automatically registered at
10754 + * boot. Modules and the console code register their own entries separately.
10755 + */
10756 +static struct toi_sysfs_data sysfs_params[] = {
10757 +       { TOI_ATTR("extra_pages_allowance", SYSFS_RW),
10758 +         SYSFS_LONG(&extra_pd1_pages_allowance, 0,
10759 +                       LONG_MAX, 0)
10760 +       },
10761 +
10762 +       { TOI_ATTR("image_exists", SYSFS_RW),
10763 +         SYSFS_CUSTOM(image_exists_read, image_exists_write,
10764 +                         SYSFS_NEEDS_SM_FOR_BOTH)
10765 +       },
10766 +
10767 +       { TOI_ATTR("resume", SYSFS_RW),
10768 +         SYSFS_STRING(resume_file, 255, SYSFS_NEEDS_SM_FOR_WRITE),
10769 +         .write_side_effect = attempt_to_parse_resume_device2,
10770 +       },
10771 +
10772 +       { TOI_ATTR("alt_resume_param", SYSFS_RW),
10773 +         SYSFS_STRING(alt_resume_param, 255, SYSFS_NEEDS_SM_FOR_WRITE),
10774 +         .write_side_effect = attempt_to_parse_alt_resume_param,
10775 +       },
10776 +       { TOI_ATTR("debug_info", SYSFS_READONLY),
10777 +         SYSFS_CUSTOM(get_toi_debug_info, NULL, 0)
10778 +       },
10779 +
10780 +       { TOI_ATTR("ignore_rootfs", SYSFS_RW),
10781 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_IGNORE_ROOTFS, 0)
10782 +       },
10783 +
10784 +       { TOI_ATTR("image_size_limit", SYSFS_RW),
10785 +         SYSFS_INT(&image_size_limit, -2, INT_MAX, 0)
10786 +       },
10787 +
10788 +       { TOI_ATTR("last_result", SYSFS_RW),
10789 +         SYSFS_UL(&toi_result, 0, 0, 0)
10790 +       },
10791 +
10792 +       { TOI_ATTR("no_multithreaded_io", SYSFS_RW),
10793 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_MULTITHREADED_IO, 0)
10794 +       },
10795 +
10796 +       { TOI_ATTR("no_flusher_thread", SYSFS_RW),
10797 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_FLUSHER_THREAD, 0)
10798 +       },
10799 +
10800 +       { TOI_ATTR("full_pageset2", SYSFS_RW),
10801 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_PAGESET2_FULL, 0)
10802 +       },
10803 +
10804 +       { TOI_ATTR("reboot", SYSFS_RW),
10805 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_REBOOT, 0)
10806 +       },
10807 +
10808 +       { TOI_ATTR("replace_swsusp", SYSFS_RW),
10809 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_REPLACE_SWSUSP, 0)
10810 +       },
10811 +
10812 +       { TOI_ATTR("resume_commandline", SYSFS_RW),
10813 +         SYSFS_STRING(toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0)
10814 +       },
10815 +
10816 +       { TOI_ATTR("version", SYSFS_READONLY),
10817 +         SYSFS_STRING(TOI_CORE_VERSION, 0, 0)
10818 +       },
10819 +
10820 +       { TOI_ATTR("no_load_direct", SYSFS_RW),
10821 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_DIRECT_LOAD, 0)
10822 +       },
10823 +
10824 +       { TOI_ATTR("freezer_test", SYSFS_RW),
10825 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_FREEZER_TEST, 0)
10826 +       },
10827 +
10828 +       { TOI_ATTR("test_bio", SYSFS_RW),
10829 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_TEST_BIO, 0)
10830 +       },
10831 +
10832 +       { TOI_ATTR("test_filter_speed", SYSFS_RW),
10833 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_TEST_FILTER_SPEED, 0)
10834 +       },
10835 +
10836 +       { TOI_ATTR("no_pageset2", SYSFS_RW),
10837 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_PAGESET2, 0)
10838 +       },
10839 +
10840 +       { TOI_ATTR("late_cpu_hotplug", SYSFS_RW),
10841 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_LATE_CPU_HOTPLUG, 0)
10842 +       },
10843 +
10844 +       { TOI_ATTR("pre_hibernate_command", SYSFS_RW),
10845 +         SYSFS_STRING(pre_hibernate_command, 0, 255)
10846 +       },
10847 +
10848 +       { TOI_ATTR("post_hibernate_command", SYSFS_RW),
10849 +         SYSFS_STRING(post_hibernate_command, 0, 255)
10850 +       },
10851 +
10852 +#ifdef CONFIG_TOI_KEEP_IMAGE
10853 +       { TOI_ATTR("keep_image", SYSFS_RW),
10854 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_KEEP_IMAGE, 0)
10855 +       },
10856 +#endif
10857 +};
10858 +
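As the comment above the array notes, modules register their own entries
separately. A sketch of what such a registration might look like, reusing
toi_register_sysfs_file() and the SYSFS_* initialisers from this file; the
flag variable, entry name and init function are invented:

        /* Editor's sketch: a module-private flag exposed via sysfs. */
        static int my_module_enabled = 1;

        static struct toi_sysfs_data my_module_sysfs_data[] = {
                { TOI_ATTR("my_module_enabled", SYSFS_RW),
                  SYSFS_INT(&my_module_enabled, 0, 1, 0)
                },
        };

        static int my_module_sysfs_init(void)
        {
                int i;

                for (i = 0; i < ARRAY_SIZE(my_module_sysfs_data); i++)
                        toi_register_sysfs_file(tuxonice_kobj,
                                        &my_module_sysfs_data[i]);
                return 0;
        }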
10859 +struct toi_core_fns my_fns = {
10860 +       .get_nonconflicting_page = __toi_get_nonconflicting_page,
10861 +       .post_context_save = __toi_post_context_save,
10862 +       .try_hibernate = _toi_try_hibernate,
10863 +       .try_resume = _toi_try_resume,
10864 +};
10865 +
10866 +/**
10867 + * core_load: Initialisation of TuxOnIce core.
10868 + *
10869 + * Initialise the core, beginning with sysfs. Checksum and so on are part of
10870 + * the core, but have their own initialisation routines because they either
10871 + * aren't compiled in all the time or have their own subdirectories.
10872 + */
10873 +static __init int core_load(void)
10874 +{
10875 +       int i,
10876 +           numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
10877 +
10878 +       strncpy(pre_hibernate_command, CONFIG_TOI_DEFAULT_PRE_HIBERNATE, 255);
10879 +       strncpy(post_hibernate_command, CONFIG_TOI_DEFAULT_POST_HIBERNATE, 255);
10880 +
10881 +       if (toi_sysfs_init())
10882 +               return 1;
10883 +
10884 +       for (i = 0; i < numfiles; i++)
10885 +               toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
10886 +
10887 +       toi_core_fns = &my_fns;
10888 +
10889 +       if (toi_alloc_init())
10890 +               return 1;
10891 +       if (toi_checksum_init())
10892 +               return 1;
10893 +       if (toi_cluster_init())
10894 +               return 1;
10895 +       if (toi_usm_init())
10896 +               return 1;
10897 +       if (toi_ui_init())
10898 +               return 1;
10899 +       if (toi_poweroff_init())
10900 +               return 1;
10901 +
10902 +       return 0;
10903 +}
10904 +
10905 +#ifdef MODULE
10906 +/**
10907 + * core_unload: Prepare to unload the core code.
10908 + */
10909 +static __exit void core_unload(void)
10910 +{
10911 +       int i,
10912 +           numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
10913 +
10914 +       toi_alloc_exit();
10915 +       toi_poweroff_exit();
10916 +       toi_ui_exit();
10917 +       toi_checksum_exit();
10918 +       toi_cluster_exit();
10919 +       toi_usm_exit();
10920 +
10921 +       for (i = 0; i < numfiles; i++)
10922 +               toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
10923 +
10924 +       toi_core_fns = NULL;
10925 +
10926 +       toi_sysfs_exit();
10927 +}
10928 +MODULE_LICENSE("GPL");
10929 +module_init(core_load);
10930 +module_exit(core_unload);
10931 +#else
10932 +late_initcall(core_load);
10933 +#endif
10934 +
10935 +#ifdef CONFIG_TOI_EXPORTS
10936 +EXPORT_SYMBOL_GPL(tuxonice_signature);
10937 +EXPORT_SYMBOL_GPL(pagedir2);
10938 +EXPORT_SYMBOL_GPL(toi_fail_num);
10939 +EXPORT_SYMBOL_GPL(do_check_can_resume);
10940 +#endif
10941 diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
10942 new file mode 100644
10943 index 0000000..2efd973
10944 --- /dev/null
10945 +++ b/kernel/power/tuxonice_io.c
10946 @@ -0,0 +1,1427 @@
10947 +/*
10948 + * kernel/power/tuxonice_io.c
10949 + *
10950 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
10951 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
10952 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
10953 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
10954 + *
10955 + * This file is released under the GPLv2.
10956 + *
10957 + * It contains high level IO routines for hibernating.
10958 + *
10959 + */
10960 +
10961 +#include <linux/suspend.h>
10962 +#include <linux/version.h>
10963 +#include <linux/utsname.h>
10964 +#include <linux/mount.h>
10965 +#include <linux/highmem.h>
10966 +#include <linux/module.h>
10967 +#include <linux/kthread.h>
10968 +#include <linux/dyn_pageflags.h>
10969 +#include <asm/tlbflush.h>
10970 +
10971 +#include "tuxonice.h"
10972 +#include "tuxonice_modules.h"
10973 +#include "tuxonice_pageflags.h"
10974 +#include "tuxonice_io.h"
10975 +#include "tuxonice_ui.h"
10976 +#include "tuxonice_storage.h"
10977 +#include "tuxonice_prepare_image.h"
10978 +#include "tuxonice_extent.h"
10979 +#include "tuxonice_sysfs.h"
10980 +#include "tuxonice_builtin.h"
10981 +#include "tuxonice_checksum.h"
10982 +#include "tuxonice_alloc.h"
10983 +char alt_resume_param[256];
10984 +
10985 +/* Variables shared between threads and updated under the mutex */
10986 +static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
10987 +static int io_index, io_nextupdate, io_pc, io_pc_step;
10988 +static unsigned long pfn, other_pfn;
10989 +static DEFINE_MUTEX(io_mutex);
10990 +static DEFINE_PER_CPU(struct page *, last_sought);
10991 +static DEFINE_PER_CPU(struct page *, last_high_page);
10992 +static DEFINE_PER_CPU(char *, checksum_locn);
10993 +static DEFINE_PER_CPU(struct pbe *, last_low_page);
10994 +static atomic_t io_count;
10995 +atomic_t toi_io_workers;
10996 +DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
10997 +int toi_bio_queue_flusher_should_finish;
10998 +
10999 +/* toi_attempt_to_parse_resume_device
11000 + *
11001 + * Can we hibernate, using the current resume= parameter?
11002 + */
11003 +int toi_attempt_to_parse_resume_device(int quiet)
11004 +{
11005 +       struct list_head *Allocator;
11006 +       struct toi_module_ops *thisAllocator;
11007 +       int result, returning = 0;
11008 +
11009 +       if (toi_activate_storage(0))
11010 +               return 0;
11011 +
11012 +       toiActiveAllocator = NULL;
11013 +       clear_toi_state(TOI_RESUME_DEVICE_OK);
11014 +       clear_toi_state(TOI_CAN_RESUME);
11015 +       clear_result_state(TOI_ABORTED);
11016 +
11017 +       if (!toiNumAllocators) {
11018 +               if (!quiet)
11019 +                       printk(KERN_INFO "TuxOnIce: No storage allocators have "
11020 +                               "been registered. Hibernating will be "
11021 +                               "disabled.\n");
11022 +               goto cleanup;
11023 +       }
11024 +
11025 +       if (!resume_file[0]) {
11026 +               if (!quiet)
11027 +                       printk("TuxOnIce: Resume= parameter is empty."
11028 +                               " Hibernating will be disabled.\n");
11029 +               goto cleanup;
11030 +       }
11031 +
11032 +       list_for_each(Allocator, &toiAllocators) {
11033 +               thisAllocator = list_entry(Allocator, struct toi_module_ops,
11034 +                                                               type_list);
11035 +
11036 +               /*
11037 +                * Not sure why you'd want to disable an allocator, but
11038 +                * we should honour the flag if we're providing it
11039 +                */
11040 +               if (!thisAllocator->enabled)
11041 +                       continue;
11042 +
11043 +               result = thisAllocator->parse_sig_location(
11044 +                               resume_file, (toiNumAllocators == 1),
11045 +                               quiet);
11046 +
11047 +               switch (result) {
11048 +               case -EINVAL:
11049 +                       /* For this allocator, but not a valid
11050 +                        * configuration. Error already printed. */
11051 +                       goto cleanup;
11052 +
11053 +               case 0:
11054 +                       /* For this allocator and valid. */
11055 +                       toiActiveAllocator = thisAllocator;
11056 +
11057 +                       set_toi_state(TOI_RESUME_DEVICE_OK);
11058 +                       set_toi_state(TOI_CAN_RESUME);
11059 +                       returning = 1;
11060 +                       goto cleanup;
11061 +               }
11062 +       }
11063 +       if (!quiet)
11064 +               printk("TuxOnIce: No matching enabled allocator found. "
11065 +                               "Resuming disabled.\n");
11066 +cleanup:
11067 +       toi_deactivate_storage(0);
11068 +       return returning;
11069 +}
11070 +
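The switch above fixes the contract for an allocator's parse_sig_location()
hook: -EINVAL means "names this allocator, but the configuration is unusable"
(aborting the scan), 0 means "this allocator's, and valid" (it becomes the
active allocator), and any other value lets the next allocator try. A sketch
of a hypothetical allocator honouring that contract; the "myalloc:" prefix and
my_alloc_setup() helper are invented:

        /* Editor's sketch: hypothetical allocator hook. */
        static int my_alloc_setup(const char *location);

        static int my_alloc_parse_sig_location(char *commandline,
                        int only_allocator, int quiet)
        {
                /*
                 * Not our prefix: let another allocator have a look.
                 * (only_allocator would let a lone allocator be lenient
                 * about the prefix; ignored in this sketch.)
                 */
                if (strncmp(commandline, "myalloc:", 8))
                        return 1;

                if (my_alloc_setup(commandline + 8)) {
                        if (!quiet)
                                printk(KERN_ERR "myalloc: Invalid storage "
                                                "location.\n");
                        return -EINVAL; /* Ours, but unusable. */
                }

                return 0;       /* Ours and valid. */
        }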
11071 +void attempt_to_parse_resume_device2(void)
11072 +{
11073 +       toi_prepare_usm();
11074 +       toi_attempt_to_parse_resume_device(0);
11075 +       toi_cleanup_usm();
11076 +}
11077 +
11078 +void save_restore_alt_param(int replace, int quiet)
11079 +{
11080 +       static char resume_param_save[255];
11081 +       static unsigned long toi_state_save;
11082 +
11083 +       if (replace) {
11084 +               toi_state_save = toi_state;
11085 +               strcpy(resume_param_save, resume_file);
11086 +               strcpy(resume_file, alt_resume_param);
11087 +       } else {
11088 +               strcpy(resume_file, resume_param_save);
11089 +               toi_state = toi_state_save;
11090 +       }
11091 +       toi_attempt_to_parse_resume_device(quiet);
11092 +}
11093 +
11094 +void attempt_to_parse_alt_resume_param(void)
11095 +{
11096 +       int ok = 0;
11097 +
11098 +       /* Temporarily set resume_param to the poweroff value */
11099 +       if (!strlen(alt_resume_param))
11100 +               return;
11101 +
11102 +       printk("=== Trying Poweroff Resume2 ===\n");
11103 +       save_restore_alt_param(SAVE, NOQUIET);
11104 +       if (test_toi_state(TOI_CAN_RESUME))
11105 +               ok = 1;
11106 +
11107 +       printk(KERN_INFO "=== Done ===\n");
11108 +       save_restore_alt_param(RESTORE, QUIET);
11109 +
11110 +       /* If not ok, clear the string */
11111 +       if (ok)
11112 +               return;
11113 +
11114 +       printk(KERN_INFO "Can't resume from that location; clearing "
11115 +                       "alt_resume_param.\n");
11116 +       alt_resume_param[0] = '\0';
11117 +}
11118 +
11119 +/* noresume_reset_modules
11120 + *
11121 + * Description:        When we read the start of an image, modules (and especially the
11122 + *             active allocator) might need to reset data structures if we
11123 + *             decide to remove the image rather than resuming from it.
11124 + */
11125 +
11126 +static void noresume_reset_modules(void)
11127 +{
11128 +       struct toi_module_ops *this_filter;
11129 +
11130 +       list_for_each_entry(this_filter, &toi_filters, type_list)
11131 +               if (this_filter->noresume_reset)
11132 +                       this_filter->noresume_reset();
11133 +
11134 +       if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
11135 +               toiActiveAllocator->noresume_reset();
11136 +}
11137 +
11138 +/* fill_toi_header()
11139 + *
11140 + * Description:        Fill the hibernate header structure.
11141 + * Arguments:  struct toi_header: Header data structure to be filled.
11142 + */
11143 +
11144 +static int fill_toi_header(struct toi_header *sh)
11145 +{
11146 +       int i, error;
11147 +
11148 +       error = init_swsusp_header((struct swsusp_info *) sh);
11149 +       if (error)
11150 +               return error;
11151 +
11152 +       sh->pagedir = pagedir1;
11153 +       sh->pageset_2_size = pagedir2.size;
11154 +       sh->param0 = toi_result;
11155 +       sh->param1 = toi_bkd.toi_action;
11156 +       sh->param2 = toi_bkd.toi_debug_state;
11157 +       sh->param3 = toi_bkd.toi_default_console_level;
11158 +       sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
11159 +       for (i = 0; i < 4; i++)
11160 +               sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
11161 +       sh->bkd = boot_kernel_data_buffer;
11162 +       return 0;
11163 +}
11164 +
11165 +/*
11166 + * rw_init_modules
11167 + *
11168 + * Iterate over modules, preparing the ones that will be used to read or write
11169 + * data.
11170 + */
11171 +static int rw_init_modules(int rw, int which)
11172 +{
11173 +       struct toi_module_ops *this_module;
11174 +       /* Initialise page transformers */
11175 +       list_for_each_entry(this_module, &toi_filters, type_list) {
11176 +               if (!this_module->enabled)
11177 +                       continue;
11178 +               if (this_module->rw_init && this_module->rw_init(rw, which)) {
11179 +                       abort_hibernate(TOI_FAILED_MODULE_INIT,
11180 +                               "Failed to initialise the %s filter.",
11181 +                               this_module->name);
11182 +                       return 1;
11183 +               }
11184 +       }
11185 +
11186 +       /* Initialise allocator */
11187 +       if (toiActiveAllocator->rw_init(rw, which)) {
11188 +               abort_hibernate(TOI_FAILED_MODULE_INIT,
11189 +                               "Failed to initialise the allocator.");
11190 +               return 1;
11191 +       }
11192 +
11193 +       /* Initialise other modules */
11194 +       list_for_each_entry(this_module, &toi_modules, module_list) {
11195 +               if (!this_module->enabled ||
11196 +                   this_module->type == FILTER_MODULE ||
11197 +                   this_module->type == WRITER_MODULE)
11198 +                       continue;
11199 +               if (this_module->rw_init && this_module->rw_init(rw, which)) {
11200 +                       set_abort_result(TOI_FAILED_MODULE_INIT);
11201 +                       printk(KERN_INFO "Setting aborted flag due to module "
11202 +                                       "init failure.\n");
11203 +                       return 1;
11204 +               }
11205 +       }
11206 +
11207 +       return 0;
11208 +}
11209 +
11210 +/*
11211 + * rw_cleanup_modules
11212 + *
11213 + * Clean up components after reading or writing a set of pages.
11214 + * Only the allocator may fail.
11215 + */
11216 +static int rw_cleanup_modules(int rw)
11217 +{
11218 +       struct toi_module_ops *this_module;
11219 +       int result = 0;
11220 +
11221 +       /* Cleanup other modules */
11222 +       list_for_each_entry(this_module, &toi_modules, module_list) {
11223 +               if (!this_module->enabled ||
11224 +                   this_module->type == FILTER_MODULE ||
11225 +                   this_module->type == WRITER_MODULE)
11226 +                       continue;
11227 +               if (this_module->rw_cleanup)
11228 +                       result |= this_module->rw_cleanup(rw);
11229 +       }
11230 +
11231 +       /* Flush data and cleanup */
11232 +       list_for_each_entry(this_module, &toi_filters, type_list) {
11233 +               if (!this_module->enabled)
11234 +                       continue;
11235 +               if (this_module->rw_cleanup)
11236 +                       result |= this_module->rw_cleanup(rw);
11237 +       }
11238 +
11239 +       result |= toiActiveAllocator->rw_cleanup(rw);
11240 +
11241 +       return result;
11242 +}
11243 +
11244 +static struct page *copy_page_from_orig_page(struct page *orig_page)
11245 +{
11246 +       int is_high = PageHighMem(orig_page), index, min, max;
11247 +       struct page *high_page = NULL,
11248 +                   **my_last_high_page = &__get_cpu_var(last_high_page),
11249 +                   **my_last_sought = &__get_cpu_var(last_sought);
11250 +       struct pbe *this, **my_last_low_page = &__get_cpu_var(last_low_page);
11251 +       void *compare;
11252 +
11253 +       if (is_high) {
11254 +               if (*my_last_sought && *my_last_high_page &&
11255 +                               *my_last_sought < orig_page)
11256 +                       high_page = *my_last_high_page;
11257 +               else
11258 +                       high_page = (struct page *) restore_highmem_pblist;
11259 +               this = (struct pbe *) kmap(high_page);
11260 +               compare = orig_page;
11261 +       } else {
11262 +               if (*my_last_sought && *my_last_low_page &&
11263 +                               *my_last_sought < orig_page)
11264 +                       this = *my_last_low_page;
11265 +               else
11266 +                       this = restore_pblist;
11267 +               compare = page_address(orig_page);
11268 +       }
11269 +
11270 +       *my_last_sought = orig_page;
11271 +
11272 +       /* Locate page containing pbe */
11273 +       while (this[PBES_PER_PAGE - 1].next &&
11274 +                       this[PBES_PER_PAGE - 1].orig_address < compare) {
11275 +               if (is_high) {
11276 +                       struct page *next_high_page = (struct page *)
11277 +                               this[PBES_PER_PAGE - 1].next;
11278 +                       kunmap(high_page);
11279 +                       this = kmap(next_high_page);
11280 +                       high_page = next_high_page;
11281 +               } else
11282 +                       this = this[PBES_PER_PAGE - 1].next;
11283 +       }
11284 +
11285 +       /* Do a binary search within the page */
11286 +       min = 0;
11287 +       max = PBES_PER_PAGE;
11288 +       index = PBES_PER_PAGE / 2;
11289 +       while (max - min) {
11290 +               if (!this[index].orig_address ||
11291 +                   this[index].orig_address > compare)
11292 +                       max = index;
11293 +               else if (this[index].orig_address == compare) {
11294 +                       if (is_high) {
11295 +                               struct page *page = this[index].address;
11296 +                               *my_last_high_page = high_page;
11297 +                               kunmap(high_page);
11298 +                               return page;
11299 +                       }
11300 +                       *my_last_low_page = this;
11301 +                       return virt_to_page(this[index].address);
11302 +               } else
11303 +                       min = index;
11304 +               index = ((max + min) / 2);
11305 +       }
11306 +
11307 +       if (is_high)
11308 +               kunmap(high_page);
11309 +
11310 +       abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
11311 +               " orig page %p. This[min].orig_address=%p.\n", orig_page,
11312 +               this[index].orig_address);
11313 +       return NULL;
11314 +}
11315 +
11316 +/*
11317 + * worker_rw_loop
11318 + *
11319 + * The per-thread work loop for reading or writing pages.
11320 + */
11321 +static int worker_rw_loop(void *data)
11322 +{
11323 +       unsigned long orig_pfn, write_pfn;
11324 +       int result, my_io_index = 0, temp;
11325 +       struct toi_module_ops *first_filter = toi_get_next_filter(NULL);
11326 +       struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
11327 +
11328 +       atomic_inc(&toi_io_workers);
11329 +       mutex_lock(&io_mutex);
11330 +
11331 +       do {
11332 +               int buf_size;
11333 +
11334 +               /*
11335 +                * What page to use? If reading, don't know yet which page's
11336 +                * data will be read, so always use the buffer. If writing,
11337 +                * use the copy (Pageset1) or original page (Pageset2), but
11338 +                * always write the pfn of the original page.
11339 +                */
11340 +               if (io_write) {
11341 +                       struct page *page;
11342 +                       char **my_checksum_locn = &__get_cpu_var(checksum_locn);
11343 +
11344 +                       pfn = get_next_bit_on(&io_map, pfn);
11345 +
11346 +                       /* Another thread could have beaten us to it. */
11347 +                       if (pfn == max_pfn + 1) {
11348 +                               if (atomic_read(&io_count)) {
11349 +                                       printk(KERN_ERR "Ran out of pfns but io_count "
11350 +                                               "is still %d.\n",
11351 +                                               atomic_read(&io_count));
11352 +                                       BUG();
11353 +                               }
11354 +                               break;
11355 +                       }
11356 +
11357 +                       my_io_index = io_finish_at -
11358 +                               atomic_sub_return(1, &io_count);
11359 +
11360 +                       orig_pfn = pfn;
11361 +                       write_pfn = pfn;
11362 +
11363 +                       /*
11364 +                        * Other_pfn is updated by all threads, so we're not
11365 +                        * writing the same page multiple times.
11366 +                        */
11367 +                       clear_dynpageflag(&io_map, pfn_to_page(pfn));
11368 +                       if (io_pageset == 1) {
11369 +                               other_pfn = get_next_bit_on(&pageset1_map,
11370 +                                               other_pfn);
11371 +                               write_pfn = other_pfn;
11372 +                       }
11373 +                       page = pfn_to_page(pfn);
11374 +
11375 +                       if (io_pageset == 2)
11376 +                               *my_checksum_locn =
11377 +                                       tuxonice_get_next_checksum();
11378 +
11379 +                       mutex_unlock(&io_mutex);
11380 +
11381 +                       if (io_pageset == 2 &&
11382 +                           tuxonice_calc_checksum(page, *my_checksum_locn))
11383 +                               return 1;
11384 +
11385 +                       result = first_filter->write_page(write_pfn, page,
11386 +                                       PAGE_SIZE);
11387 +               } else {
11388 +                       my_io_index = io_finish_at -
11389 +                               atomic_sub_return(1, &io_count);
11390 +                       mutex_unlock(&io_mutex);
11391 +
11392 +                       /*
11393 +                        * Are we aborting? If so, don't submit any more I/O as
11394 +                        * resetting the resume_attempted flag (from ui.c) will
11395 +                        * clear the bdev flags, making this thread oops.
11396 +                        */
11397 +                       if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
11398 +                               atomic_dec(&toi_io_workers);
11399 +                               if (!atomic_read(&toi_io_workers))
11400 +                                       set_toi_state(TOI_IO_STOPPED);
11401 +                               while (1)
11402 +                                       schedule();
11403 +                       }
11404 +
11405 +                       result = first_filter->read_page(&write_pfn, buffer,
11406 +                                       &buf_size);
11407 +                       if (buf_size != PAGE_SIZE) {
11408 +                               abort_hibernate(TOI_FAILED_IO,
11409 +                                       "I/O pipeline returned %d bytes instead"
11410 +                                       " of %d.\n", buf_size, PAGE_SIZE);
11411 +                               mutex_lock(&io_mutex);
11412 +                               break;
11413 +                       }
11414 +               }
11415 +
11416 +               if (result) {
11417 +                       io_result = result;
11418 +                       if (io_write) {
11419 +                               printk(KERN_INFO "Write chunk returned %d.\n",
11420 +                                               result);
11421 +                               abort_hibernate(TOI_FAILED_IO,
11422 +                                       "Failed to write a chunk of the "
11423 +                                       "image.");
11424 +                               mutex_lock(&io_mutex);
11425 +                               break;
11426 +                       }
11427 +                       panic("Read chunk returned (%d)", result);
11428 +               }
11429 +
11430 +               /*
11431 +                * Discard reads of resaved pages while reading ps2
11432 +                * and unwanted pages while rereading ps2 when aborting.
11433 +                */
11434 +               if (!io_write && !PageResave(pfn_to_page(write_pfn))) {
11435 +                       struct page *final_page = pfn_to_page(write_pfn),
11436 +                                   *copy_page = final_page;
11437 +                       char *virt, *buffer_virt;
11438 +
11439 +                       if (io_pageset == 1 && !load_direct(final_page)) {
11440 +                               copy_page =
11441 +                                       copy_page_from_orig_page(final_page);
11442 +                               BUG_ON(!copy_page);
11443 +                       }
11444 +
11445 +                       if (test_dynpageflag(&io_map, final_page)) {
11446 +                               virt = kmap(copy_page);
11447 +                               buffer_virt = kmap(buffer);
11448 +                               memcpy(virt, buffer_virt, PAGE_SIZE);
11449 +                               kunmap(copy_page);
11450 +                               kunmap(buffer);
11451 +                               clear_dynpageflag(&io_map, final_page);
11452 +                       } else {
11453 +                               mutex_lock(&io_mutex);
11454 +                               atomic_inc(&io_count);
11455 +                               mutex_unlock(&io_mutex);
11456 +                       }
11457 +               }
11458 +
11459 +               temp = my_io_index + io_base - io_nextupdate;
11460 +
11461 +               if (my_io_index + io_base == io_nextupdate)
11462 +                       io_nextupdate = toi_update_status(my_io_index +
11463 +                               io_base, io_barmax, " %d/%d MB ",
11464 +                               MB(io_base+my_io_index+1), MB(io_barmax));
11465 +
11466 +               if (my_io_index == io_pc) {
11467 +                       printk("%s%d%%...", io_pc_step == 1 ? KERN_ERR : "",
11468 +                                       20 * io_pc_step);
11469 +                       io_pc_step++;
11470 +                       io_pc = io_finish_at * io_pc_step / 5;
11471 +               }
11472 +
11473 +               toi_cond_pause(0, NULL);
11474 +
11475 +               /*
11476 +                * Subtle: If there's less I/O still to be done than threads
11477 +                * running, quit. This stops us doing I/O beyond the end of
11478 +                * the image when reading.
11479 +                *
11480 +                * Possible race condition. Two threads could do the test at
11481 +                * the same time; one should exit and one should continue.
11482 +                * Therefore we take the mutex before comparing and exiting.
11483 +                */
11484 +
11485 +               mutex_lock(&io_mutex);
11486 +
11487 +       } while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
11488 +               !(io_write && test_result_state(TOI_ABORTED)));
11489 +
11490 +       if (atomic_dec_and_test(&toi_io_workers)) {
11491 +               toi_bio_queue_flusher_should_finish = 1;
11492 +               wake_up(&toi_io_queue_flusher);
11493 +       }
11494 +       mutex_unlock(&io_mutex);
11495 +
11496 +       toi__free_page(28, buffer);
11497 +
11498 +       return 0;
11499 +}
11500 +
11501 +int start_other_threads(void)
11502 +{
11503 +       int cpu, num_started = 0;
11504 +       struct task_struct *p;
11505 +
11506 +       for_each_online_cpu(cpu) {
11507 +               if (cpu == smp_processor_id())
11508 +                       continue;
11509 +
11510 +               p = kthread_create(worker_rw_loop, NULL, "ks2io/%d", cpu);
11511 +               if (IS_ERR(p)) {
11512 +                       printk("ks2io for %i failed\n", cpu);
11513 +                       continue;
11514 +               }
11515 +               kthread_bind(p, cpu);
11516 +               p->flags |= PF_MEMALLOC;
11517 +               wake_up_process(p);
11518 +               num_started++;
11519 +       }
11520 +
11521 +       return num_started;
11522 +}
11523 +
11524 +/*
11525 + * do_rw_loop
11526 + *
11527 + * The main I/O loop for reading or writing pages.
11528 + */
11529 +static int do_rw_loop(int write, int finish_at, struct dyn_pageflags *pageflags,
11530 +               int base, int barmax, int pageset)
11531 +{
11532 +       int index = 0, cpu, num_other_threads = 0;
11533 +
11534 +       if (!finish_at)
11535 +               return 0;
11536 +
11537 +       io_write = write;
11538 +       io_finish_at = finish_at;
11539 +       io_base = base;
11540 +       io_barmax = barmax;
11541 +       io_pageset = pageset;
11542 +       io_index = 0;
11543 +       io_pc = io_finish_at / 5;
11544 +       io_pc_step = 1;
11545 +       io_result = 0;
11546 +       io_nextupdate = base + 1;
11547 +       toi_bio_queue_flusher_should_finish = 0;
11548 +
11549 +       for_each_online_cpu(cpu) {
11550 +               per_cpu(last_sought, cpu) = NULL;
11551 +               per_cpu(last_low_page, cpu) = NULL;
11552 +               per_cpu(last_high_page, cpu) = NULL;
11553 +       }
11554 +
11555 +       /* Ensure all bits clear */
11556 +       clear_dyn_pageflags(&io_map);
11557 +
11558 +       /* Set the bits for the pages to write */
11559 +       pfn = get_next_bit_on(pageflags, max_pfn + 1);
11560 +
11561 +       while (pfn < max_pfn + 1 && index < finish_at) {
11562 +               set_dynpageflag(&io_map, pfn_to_page(pfn));
11563 +               pfn = get_next_bit_on(pageflags, pfn);
11564 +               index++;
11565 +       }
11566 +
11567 +       BUG_ON(index < finish_at);
11568 +
11569 +       atomic_set(&io_count, finish_at);
11570 +
11571 +       pfn = max_pfn + 1;
11572 +       other_pfn = pfn;
11573 +
11574 +       clear_toi_state(TOI_IO_STOPPED);
11575 +
11576 +       if (!test_action_state(TOI_NO_MULTITHREADED_IO))
11577 +               num_other_threads = start_other_threads();
11578 +
11579 +       if (!num_other_threads || !toiActiveAllocator->io_flusher ||
11580 +               test_action_state(TOI_NO_FLUSHER_THREAD))
11581 +               worker_rw_loop(NULL);
11582 +       else
11583 +               toiActiveAllocator->io_flusher(write);
11584 +
11585 +       while (atomic_read(&toi_io_workers))
11586 +               schedule();
11587 +
11588 +       set_toi_state(TOI_IO_STOPPED);
11589 +       if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
11590 +               while (1)
11591 +                       schedule();
11592 +       }
11593 +
11594 +       if (!io_result) {
11595 +               printk("done.\n");
11596 +
11597 +               toi_update_status(io_base + io_finish_at, io_barmax,
11598 +                               " %d/%d MB ",
11599 +                               MB(io_base + io_finish_at), MB(io_barmax));
11600 +       }
11601 +
11602 +       if (io_write && test_result_state(TOI_ABORTED))
11603 +               io_result = 1;
11604 +       else { /* All I/O done? */
11605 +               if (get_next_bit_on(&io_map, max_pfn + 1) != max_pfn + 1) {
11606 +                       printk(KERN_INFO "Finished I/O loop but still work to "
11607 +                                       "do?\nFinish at = %d. io_count = %d.\n",
11608 +                                       finish_at, atomic_read(&io_count));
11609 +                       BUG();
11610 +               }
11611 +       }
11612 +
11613 +       return io_result;
11614 +}
11615 +
11616 +/* write_pageset()
11617 + *
11618 + * Description:        Write a pageset to disk.
11619 + * Arguments:  pagedir:        Which pagedir to write.
11620 + * Returns:    Zero on success or non-zero on failure.
11621 + */
11622 +
11623 +int write_pageset(struct pagedir *pagedir)
11624 +{
11625 +       int finish_at, base = 0, start_time, end_time;
11626 +       int barmax = pagedir1.size + pagedir2.size;
11627 +       long error = 0;
11628 +       struct dyn_pageflags *pageflags;
11629 +
11630 +       /*
11631 +        * Even if there is nothing to read or write, the allocator
11632 +        * may need the init/cleanup for its housekeeping (e.g.
11633 +        * pageset1 may start where pageset2 ends when writing).
11634 +        */
11635 +       finish_at = pagedir->size;
11636 +
11637 +       if (pagedir->id == 1) {
11638 +               toi_prepare_status(DONT_CLEAR_BAR,
11639 +                               "Writing kernel & process data...");
11640 +               base = pagedir2.size;
11641 +               if (test_action_state(TOI_TEST_FILTER_SPEED) ||
11642 +                   test_action_state(TOI_TEST_BIO))
11643 +                       pageflags = &pageset1_map;
11644 +               else
11645 +                       pageflags = &pageset1_copy_map;
11646 +       } else {
11647 +               toi_prepare_status(CLEAR_BAR, "Writing caches...");
11648 +               pageflags = &pageset2_map;
11649 +       }
11650 +
11651 +       start_time = jiffies;
11652 +
11653 +       if (rw_init_modules(1, pagedir->id)) {
11654 +               abort_hibernate(TOI_FAILED_MODULE_INIT,
11655 +                               "Failed to initialise modules for writing.");
11656 +               error = 1;
11657 +       }
11658 +
11659 +       if (!error)
11660 +               error = do_rw_loop(1, finish_at, pageflags, base, barmax,
11661 +                               pagedir->id);
11662 +
11663 +       if (rw_cleanup_modules(WRITE) && !error) {
11664 +               abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
11665 +                               "Failed to cleanup after writing.");
11666 +               error = 1;
11667 +       }
11668 +
11669 +       end_time = jiffies;
11670 +
11671 +       if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
11672 +               toi_bkd.toi_io_time[0][0] += finish_at;
11673 +               toi_bkd.toi_io_time[0][1] += (end_time - start_time);
11674 +       }
11675 +
11676 +       return error;
11677 +}
11678 +
11679 +/* read_pageset()
11680 + *
11681 + * Description:        Read a pageset from disk.
11682 + * Arguments:  pagedir:        Which pagedir to read.
11683 + *             overwrittenpagesonly: Whether to read the whole pageset or
11684 + *             only the part overwritten by pageset1.
11685 + * Returns:    Zero on success or -1 on failure.
11686 + */
11687 +
11688 +static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
11689 +{
11690 +       int result = 0, base = 0, start_time, end_time;
11691 +       int finish_at = pagedir->size;
11692 +       int barmax = pagedir1.size + pagedir2.size;
11693 +       struct dyn_pageflags *pageflags;
11694 +
11695 +       if (pagedir->id == 1) {
11696 +               toi_prepare_status(CLEAR_BAR,
11697 +                               "Reading kernel & process data...");
11698 +               pageflags = &pageset1_map;
11699 +       } else {
11700 +               toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
11701 +               if (overwrittenpagesonly)
11702 +                       barmax = finish_at = min(pagedir1.size,
11703 +                                                pagedir2.size);
11704 +               else
11705 +                       base = pagedir1.size;
11706 +               pageflags = &pageset2_map;
11707 +       }
11708 +
11709 +       start_time = jiffies;
11710 +
11711 +       if (rw_init_modules(0, pagedir->id)) {
11712 +               toiActiveAllocator->remove_image();
11713 +               result = 1;
11714 +       } else
11715 +               result = do_rw_loop(0, finish_at, pageflags, base, barmax,
11716 +                               pagedir->id);
11717 +
11718 +       if (rw_cleanup_modules(READ) && !result) {
11719 +               abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
11720 +                               "Failed to cleanup after reading.");
11721 +               result = 1;
11722 +       }
11723 +
11724 +       /* Statistics */
11725 +       end_time = jiffies;
11726 +
11727 +       if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
11728 +               toi_bkd.toi_io_time[1][0] += finish_at;
11729 +               toi_bkd.toi_io_time[1][1] += (end_time - start_time);
11730 +       }
11731 +
11732 +       return result;
11733 +}
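+
+/*
+ * Note on the statistics: toi_io_time[rw][0] accumulates pages
+ * transferred and toi_io_time[rw][1] the jiffies taken ([0] = write,
+ * [1] = read). A rough throughput estimate (a sketch only; a real
+ * consumer should guard against a zero jiffies count) would be:
+ *
+ *   bytes_per_sec = (toi_bkd.toi_io_time[1][0] * PAGE_SIZE * HZ) /
+ *                   toi_bkd.toi_io_time[1][1];
+ */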
11734 +
11735 +/* write_module_configs()
11736 + *
11737 + * Description:        Store the configuration for each module in the image header.
11738 + * Returns:    Int: Zero on success, Error value otherwise.
11739 + */
11740 +static int write_module_configs(void)
11741 +{
11742 +       struct toi_module_ops *this_module;
11743 +       char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
11744 +       int len, index = 1;
11745 +       struct toi_module_header toi_module_header;
11746 +
11747 +       if (!buffer) {
11748 +               printk(KERN_INFO "Failed to allocate a buffer for saving "
11749 +                               "module configuration info.\n");
11750 +               return -ENOMEM;
11751 +       }
11752 +
11753 +       /*
11754 +        * We have to know which data goes with which module, so we
11755 +        * write at least a length of zero for every module. Note that we are
11756 +        * also assuming every module's config data takes <= PAGE_SIZE.
11757 +        */
11758 +
11759 +       /* For each module (in registration order) */
11760 +       list_for_each_entry(this_module, &toi_modules, module_list) {
11761 +               if (!this_module->enabled || !this_module->storage_needed ||
11762 +                   (this_module->type == WRITER_MODULE &&
11763 +                    toiActiveAllocator != this_module))
11764 +                       continue;
11765 +
11766 +               /* Get the data from the module */
11767 +               len = 0;
11768 +               if (this_module->save_config_info)
11769 +                       len = this_module->save_config_info(buffer);
11770 +
11771 +               /* Save the details of the module */
11772 +               toi_module_header.enabled = this_module->enabled;
11773 +               toi_module_header.type = this_module->type;
11774 +               toi_module_header.index = index++;
11775 +               strncpy(toi_module_header.name, this_module->name,
11776 +                                       sizeof(toi_module_header.name));
11777 +               toiActiveAllocator->rw_header_chunk(WRITE,
11778 +                               this_module,
11779 +                               (char *) &toi_module_header,
11780 +                               sizeof(toi_module_header));
11781 +
11782 +               /* Save the size of the data and any data returned */
11783 +               toiActiveAllocator->rw_header_chunk(WRITE,
11784 +                               this_module,
11785 +                               (char *) &len, sizeof(int));
11786 +               if (len)
11787 +                       toiActiveAllocator->rw_header_chunk(
11788 +                               WRITE, this_module, buffer, len);
11789 +       }
11790 +
11791 +       /* Write a blank header to terminate the list */
11792 +       toi_module_header.name[0] = '\0';
11793 +       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
11794 +                       (char *) &toi_module_header, sizeof(toi_module_header));
11795 +
11796 +       toi_free_page(22, (unsigned long) buffer);
11797 +       return 0;
11798 +}
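+
+/*
+ * The header stream written above is a simple tagged sequence:
+ *
+ *   struct toi_module_header   (module 1)
+ *   int len; char data[len];   (len == 0 => no config data)
+ *   struct toi_module_header   (module 2)
+ *   ...
+ *   struct toi_module_header   (terminator: name[0] == '\0')
+ *
+ * read_module_configs() below consumes exactly this layout.
+ */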
11799 +
11800 +/* read_module_configs()
11801 + *
11802 + * Description:        Reload module configurations from the image header.
11803 + * Returns:    Int. Zero on success, error value otherwise.
11804 + */
11805 +
11806 +static int read_module_configs(void)
11807 +{
11808 +       struct toi_module_ops *this_module;
11809 +       char *buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
11810 +       int len, result = 0;
11811 +       struct toi_module_header toi_module_header;
11812 +
11813 +       if (!buffer) {
11814 +               printk("Failed to allocate a buffer for reloading module "
11815 +                               "configuration info.\n");
11816 +               return -ENOMEM;
11817 +       }
11818 +
11819 +       /* All modules are initially disabled. That way, if we have a module
11820 +        * loaded now that wasn't loaded when we hibernated, it won't be used
11821 +        * in trying to read the data.
11822 +        */
11823 +       list_for_each_entry(this_module, &toi_modules, module_list)
11824 +               this_module->enabled = 0;
11825 +
11826 +       /* Get the first module header */
11827 +       result = toiActiveAllocator->rw_header_chunk(READ, NULL,
11828 +                       (char *) &toi_module_header,
11829 +                       sizeof(toi_module_header));
11830 +       if (result) {
11831 +               printk("Failed to read the next module header.\n");
11832 +               toi_free_page(23, (unsigned long) buffer);
11833 +               return -EINVAL;
11834 +       }
11835 +
11836 +       /* For each module (in registration order) */
11837 +       while (toi_module_header.name[0]) {
11838 +
11839 +               /* Find the module */
11840 +               this_module =
11841 +                       toi_find_module_given_name(toi_module_header.name);
11842 +
11843 +               if (!this_module) {
11844 +                       /*
11845 +                        * Is it used? Only need to worry about filters. The
11846 +                        * active allocator must be loaded!
11847 +                        */
11848 +                       if (toi_module_header.enabled) {
11849 +                               toi_early_boot_message(1, TOI_CONTINUE_REQ,
11850 +                                       "It looks like we need module %s for "
11851 +                                       "reading the image but it hasn't been "
11852 +                                       "registered.\n",
11853 +                                       toi_module_header.name);
11854 +                               if (!(test_toi_state(TOI_CONTINUE_REQ))) {
11855 +                                       toi_free_page(23,
11856 +                                                       (unsigned long) buffer);
11857 +                                       return -EINVAL;
11858 +                               }
11859 +                       } else
11860 +                               printk(KERN_INFO "Module %s configuration data "
11861 +                                       "found, but the module hasn't "
11862 +                                       "registered. Looks like it was "
11863 +                                       "disabled, so we're ignoring its data.",
11864 +                                       toi_module_header.name);
11865 +               }
11866 +
11867 +               /* Get the length of the data (if any) */
11868 +               result = toiActiveAllocator->rw_header_chunk(READ, NULL,
11869 +                               (char *) &len, sizeof(int));
11870 +               if (result) {
11871 +                       printk("Failed to read the length of the module %s's"
11872 +                                       " configuration data.\n",
11873 +                                       toi_module_header.name);
11874 +                       toi_free_page(23, (unsigned long) buffer);
11875 +                       return -EINVAL;
11876 +               }
11877 +
11878 +               /* Read any data and pass to the module (if we found one) */
11879 +               if (len) {
11880 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
11881 +                                       buffer, len);
11882 +                       if (this_module) {
11883 +                               if (!this_module->load_config_info) {
11884 +                                       printk("Huh? Module %s appears to have "
11885 +                                               "a save_config_info, but not a "
11886 +                                               "load_config_info function!\n",
11887 +                                               this_module->name);
11888 +                               } else
11889 +                                       this_module->load_config_info(buffer,
11890 +                                                       len);
11891 +                       }
11892 +               }
11893 +
11894 +               if (this_module) {
11895 +                       /* Now move this module to the tail of its lists. This
11896 +                        * will put it in order. Any new modules will end up at
11897 +                        * the top of the lists. They should have been set to
11898 +                        * disabled when loaded (people will normally not edit
11899 +                        * an initrd to load a new module and then hibernate
11900 +                        * without using it!).
11901 +                        */
11902 +
11903 +                       toi_move_module_tail(this_module);
11904 +
11905 +                       /*
11906 +                        * We apply the disabled state; modules don't need to
11907 +                        * save whether they were disabled and if they do, we
11908 +                        * override them anyway.
11909 +                        */
11910 +                       this_module->enabled = toi_module_header.enabled;
11911 +               }
11912 +
11913 +               /* Get the next module header */
11914 +               result = toiActiveAllocator->rw_header_chunk(READ, NULL,
11915 +                               (char *) &toi_module_header,
11916 +                               sizeof(toi_module_header));
11917 +
11918 +               if (result) {
11919 +                       printk("Failed to read the next module header.\n");
11920 +                       toi_free_page(23, (unsigned long) buffer);
11921 +                       return -EINVAL;
11922 +               }
11923 +
11924 +       }
11925 +
11926 +       toi_free_page(23, (unsigned long) buffer);
11927 +       return 0;
11928 +}
11929 +
11930 +/* write_image_header()
11931 + *
11932 + * Description:        Write the image header after writing the image proper.
11933 + * Returns:    Int. Zero on success or -1 on failure.
11934 + */
11935 +
11936 +int write_image_header(void)
11937 +{
11938 +       int ret;
11939 +       int total = pagedir1.size + pagedir2.size + 2;
11940 +       char *header_buffer = NULL;
11941 +
11942 +       /* Now prepare to write the header */
11943 +       ret = toiActiveAllocator->write_header_init();
11944 +       if (ret) {
11945 +               abort_hibernate(TOI_FAILED_MODULE_INIT,
11946 +                               "Active allocator's write_header_init"
11947 +                               " function failed.");
11948 +               goto write_image_header_abort;
11949 +       }
11950 +
11951 +       /* Get a buffer */
11952 +       header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
11953 +       if (!header_buffer) {
11954 +               abort_hibernate(TOI_OUT_OF_MEMORY,
11955 +                       "Out of memory when trying to get page for header!");
11956 +               goto write_image_header_abort;
11957 +       }
11958 +
11959 +       /* Write hibernate header */
11960 +       if (fill_toi_header((struct toi_header *) header_buffer)) {
11961 +               abort_hibernate(TOI_OUT_OF_MEMORY,
11962 +                       "Failure to fill header information!");
11963 +               goto write_image_header_abort;
11964 +       }
11965 +       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
11966 +                       header_buffer, sizeof(struct toi_header));
11967 +
11968 +       toi_free_page(24, (unsigned long) header_buffer);
11969 +
11970 +       /* Write module configurations */
11971 +       ret = write_module_configs();
11972 +       if (ret) {
11973 +               abort_hibernate(TOI_FAILED_IO,
11974 +                               "Failed to write module configs.");
11975 +               goto write_image_header_abort;
11976 +       }
11977 +
11978 +       save_dyn_pageflags(&pageset1_map);
11979 +
11980 +       /* Flush data and let allocator cleanup */
11981 +       if (toiActiveAllocator->write_header_cleanup()) {
11982 +               abort_hibernate(TOI_FAILED_IO,
11983 +                               "Failed to cleanup writing header.");
11984 +               goto write_image_header_abort_no_cleanup;
11985 +       }
11986 +
11987 +       if (test_result_state(TOI_ABORTED))
11988 +               goto write_image_header_abort_no_cleanup;
11989 +
11990 +       toi_update_status(total, total, NULL);
11991 +
11992 +       return 0;
11993 +
11994 +write_image_header_abort:
11995 +       toiActiveAllocator->write_header_cleanup();
11996 +write_image_header_abort_no_cleanup:
11997 +       return -1;
11998 +}
11999 +
12000 +/* sanity_check()
12001 + *
12002 + * Description:        Perform a few checks, seeking to ensure that the kernel being
12003 + *             booted matches the one hibernated. They need to match so we can
12004 + *             be _sure_ things will work. Resuming with a different kernel
12005 + *             might work; it's just not assured.
12006 + * Arguments:  Struct toi_header. The header which was saved at hibernate
12007 + *             time.
12008 + */
12009 +static char *sanity_check(struct toi_header *sh)
12010 +{
12011 +       char *reason = check_swsusp_image_kernel((struct swsusp_info *) sh);
12012 +
12013 +       if (reason)
12014 +               return reason;
12015 +
12016 +       if (!test_action_state(TOI_IGNORE_ROOTFS)) {
12017 +               const struct super_block *sb;
12018 +               list_for_each_entry(sb, &super_blocks, s_list) {
12019 +                       if ((!(sb->s_flags & MS_RDONLY)) &&
12020 +                           (sb->s_type->fs_flags & FS_REQUIRES_DEV))
12021 +                               return "Device backed fs has been mounted "
12022 +                                       "rw prior to resume or initrd/ramfs "
12023 +                                       "is mounted rw.";
12024 +               }
12025 +       }
12026 +
12027 +       return NULL;
12028 +}
12029 +
12030 +/* __read_pageset1
12031 + *
12032 + * Description:        Test for the existence of an image and attempt to load it.
12033 + * Returns:    Int. Zero if image found and pageset1 successfully loaded.
12034 + *             Error if no image found or loaded.
12035 + */
12036 +static int __read_pageset1(void)
12037 +{
12038 +       int i, result = 0;
12039 +       char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
12040 +            *sanity_error = NULL;
12041 +       struct toi_header *toi_header;
12042 +
12043 +       if (!header_buffer) {
12044 +               printk(KERN_INFO "Unable to allocate a page for reading the "
12045 +                               "signature.\n");
12046 +               return -ENOMEM;
12047 +       }
12048 +
12049 +       /* Check for an image */
12050 +       result = toiActiveAllocator->image_exists(1);
12051 +       if (!result) {
12052 +               result = -ENODATA;
12053 +               noresume_reset_modules();
12054 +               printk(KERN_INFO "TuxOnIce: No image found.\n");
12055 +               goto out;
12056 +       }
12057 +
12058 +       /*
12059 +        * Prepare the active allocator for reading the image header. The
12060 +        * active allocator might read its own configuration.
12061 +        *
12062 +        * NB: This call may never return: there might be a signature for
12063 +        * a different image, in which case we warn the user and they may
12064 +        * choose to reboot (e.g. if the device ids look erroneous (2.4 vs
12065 +        * 2.6), or if the image was stored over a network connection whose
12066 +        * location is now unavailable).
12067 +        */
12068 +
12069 +       result = toiActiveAllocator->read_header_init();
12070 +       if (result) {
12071 +               printk("TuxOnIce: Failed to initialise, reading the image "
12072 +                               "header.\n");
12073 +               goto out_remove_image;
12074 +       }
12075 +
12076 +       /* Check for noresume command line option */
12077 +       if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
12078 +               printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
12079 +                               "image.\n");
12080 +               goto out_remove_image;
12081 +       }
12082 +
12083 +       /* Check whether we've resumed before */
12084 +       if (test_toi_state(TOI_RESUMED_BEFORE)) {
12085 +               toi_early_boot_message(1, 0, NULL);
12086 +               if (!(test_toi_state(TOI_CONTINUE_REQ))) {
12087 +                       printk(KERN_INFO "TuxOnIce: Tried to resume before: "
12088 +                                       "Invalidated image.\n");
12089 +                       goto out_remove_image;
12090 +               }
12091 +       }
12092 +
12093 +       clear_toi_state(TOI_CONTINUE_REQ);
12094 +
12095 +       /* Read hibernate header */
12096 +       result = toiActiveAllocator->rw_header_chunk(READ, NULL,
12097 +                       header_buffer, sizeof(struct toi_header));
12098 +       if (result < 0) {
12099 +               printk("TuxOnIce: Failed to read the image signature.\n");
12100 +               goto out_remove_image;
12101 +       }
12102 +
12103 +       toi_header = (struct toi_header *) header_buffer;
12104 +
12105 +       /*
12106 +        * NB: This call may also result in a reboot rather than returning.
12107 +        */
12108 +
12109 +       sanity_error = sanity_check(toi_header);
12110 +       if (sanity_error) {
12111 +               toi_early_boot_message(1, TOI_CONTINUE_REQ,
12112 +                               sanity_error);
12113 +               printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
12114 +               goto out_remove_image;
12115 +       }
12116 +
12117 +       /*
12118 +        * We have an image and it looks like it will load okay.
12119 +        *
12120 +        * Get metadata from header. Don't override commandline parameters.
12121 +        *
12122 +        * We don't need to save the image size limit because it's not used
12123 +        * during resume and will be restored with the image anyway.
12124 +        */
12125 +
12126 +       memcpy((char *) &pagedir1,
12127 +               (char *) &toi_header->pagedir, sizeof(pagedir1));
12128 +       toi_result = toi_header->param0;
12129 +       toi_bkd.toi_action = toi_header->param1;
12130 +       toi_bkd.toi_debug_state = toi_header->param2;
12131 +       toi_bkd.toi_default_console_level = toi_header->param3;
12132 +       clear_toi_state(TOI_IGNORE_LOGLEVEL);
12133 +       pagedir2.size = toi_header->pageset_2_size;
12134 +       for (i = 0; i < 4; i++)
12135 +               toi_bkd.toi_io_time[i/2][i%2] =
12136 +                       toi_header->io_time[i/2][i%2];
12137 +       boot_kernel_data_buffer = toi_header->bkd;
12138 +
12139 +       /* Read module configurations */
12140 +       result = read_module_configs();
12141 +       if (result) {
12142 +               pagedir1.size = pagedir2.size = 0;
12143 +               printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
12144 +                               "configurations.\n");
12145 +               clear_action_state(TOI_KEEP_IMAGE);
12146 +               goto out_remove_image;
12147 +       }
12148 +
12149 +       toi_prepare_console();
12150 +
12151 +       set_toi_state(TOI_NOW_RESUMING);
12152 +
12153 +       if (pre_resume_freeze())
12154 +               goto out_reset_console;
12155 +
12156 +       toi_cond_pause(1, "About to read original pageset1 locations.");
12157 +
12158 +       /*
12159 +        * Read original pageset1 locations. These are the addresses we can't
12160 +        * use for the data to be restored.
12161 +        */
12162 +
12163 +       if (allocate_dyn_pageflags(&pageset1_map, 0) ||
12164 +           allocate_dyn_pageflags(&pageset1_copy_map, 0) ||
12165 +           allocate_dyn_pageflags(&io_map, 0))
12166 +               goto out_reset_console;
12167 +
12168 +       if (load_dyn_pageflags(&pageset1_map))
12169 +               goto out_reset_console;
12170 +
12171 +       /* Clean up after reading the header */
12172 +       result = toiActiveAllocator->read_header_cleanup();
12173 +       if (result) {
12174 +               printk("TuxOnIce: Failed to cleanup after reading the image "
12175 +                               "header.\n");
12176 +               goto out_reset_console;
12177 +       }
12178 +
12179 +       toi_cond_pause(1, "About to read pagedir.");
12180 +
12181 +       /*
12182 +        * Get the addresses of the pages into which we will load the
12183 +        * kernel data to be copied back.
12184 +        */
12185 +       if (toi_get_pageset1_load_addresses()) {
12186 +               printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
12187 +                               "pageset1.\n");
12188 +               goto out_reset_console;
12189 +       }
12190 +
12191 +       /* Read the original kernel back */
12192 +       toi_cond_pause(1, "About to read pageset 1.");
12193 +
12194 +       if (read_pageset(&pagedir1, 0)) {
12195 +               toi_prepare_status(CLEAR_BAR, "Failed to read pageset 1.");
12196 +               result = -EIO;
12197 +               printk(KERN_INFO "TuxOnIce: Failed to get load pageset1.\n");
12198 +               goto out_reset_console;
12199 +       }
12200 +
12201 +       toi_cond_pause(1, "About to restore original kernel.");
12202 +       result = 0;
12203 +
12204 +       if (!test_action_state(TOI_KEEP_IMAGE) &&
12205 +           toiActiveAllocator->mark_resume_attempted)
12206 +               toiActiveAllocator->mark_resume_attempted(1);
12207 +
12208 +out:
12209 +       toi_free_page(25, (unsigned long) header_buffer);
12210 +       return result;
12211 +
12212 +out_reset_console:
12213 +       toi_cleanup_console();
12214 +
12215 +out_remove_image:
12216 +       free_dyn_pageflags(&pageset1_map);
12217 +       free_dyn_pageflags(&pageset1_copy_map);
12218 +       free_dyn_pageflags(&io_map);
12219 +       result = -EINVAL;
12220 +       if (!test_action_state(TOI_KEEP_IMAGE))
12221 +               toiActiveAllocator->remove_image();
12222 +       toiActiveAllocator->read_header_cleanup();
12223 +       noresume_reset_modules();
12224 +       goto out;
12225 +}
12226 +
12227 +/* read_pageset1()
12228 + *
12229 + * Description:        Attempt to read the header and pageset1 of a hibernate image.
12230 + *             Handle the outcome, complaining where appropriate.
12231 + */
12232 +
12233 +int read_pageset1(void)
12234 +{
12235 +       int error;
12236 +
12237 +       error = __read_pageset1();
12238 +
12239 +       if (error && error != -ENODATA && error != -EINVAL &&
12240 +                                       !test_result_state(TOI_ABORTED))
12241 +               abort_hibernate(TOI_IMAGE_ERROR,
12242 +                       "TuxOnIce: Error %d resuming\n", error);
12243 +
12244 +       return error;
12245 +}
12246 +
12247 +/*
12248 + * get_have_image_data() - report whether an image exists and which kernel wrote it.
12249 + */
12250 +static char *get_have_image_data(void)
12251 +{
12252 +       char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
12253 +       struct toi_header *toi_header;
12254 +
12255 +       if (!output_buffer) {
12256 +               printk(KERN_INFO "Output buffer null.\n");
12257 +               return NULL;
12258 +       }
12259 +
12260 +       /* Check for an image */
12261 +       if (!toiActiveAllocator->image_exists(1) ||
12262 +           toiActiveAllocator->read_header_init() ||
12263 +           toiActiveAllocator->rw_header_chunk(READ, NULL,
12264 +                       output_buffer, sizeof(struct toi_header))) {
12265 +               sprintf(output_buffer, "0\n");
12266 +               /*
12267 +                * From an initrd/ramfs, catting have_image and
12268 +                * getting a result of 0 is sufficient.
12269 +                */
12270 +               clear_toi_state(TOI_BOOT_TIME);
12271 +               goto out;
12272 +       }
12273 +
12274 +       toi_header = (struct toi_header *) output_buffer;
12275 +
12276 +       sprintf(output_buffer, "1\n%s\n%s\n",
12277 +                       toi_header->uts.machine,
12278 +                       toi_header->uts.version);
12279 +
12280 +       /* Check whether we've resumed before */
12281 +       if (test_toi_state(TOI_RESUMED_BEFORE))
12282 +               strcat(output_buffer, "Resumed before.\n");
12283 +
12284 +out:
12285 +       noresume_reset_modules();
12286 +       return output_buffer;
12287 +}
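+
+/*
+ * The buffer returned above is what userspace sees when it cats the
+ * sysfs file: either "0\n", or (with illustrative values):
+ *
+ *   1
+ *   x86_64                   (toi_header->uts.machine)
+ *   #1 SMP ...               (toi_header->uts.version)
+ *   Resumed before.          (only if TOI_RESUMED_BEFORE is set)
+ */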
12288 +
12289 +/* read_pageset2()
12290 + *
12291 + * Description:        Read in part or all of pageset2 of an image, depending upon
12292 + *             whether we are hibernating and have only overwritten a portion
12293 + *             with pageset1 pages, or are resuming and need to read them
12294 + *             all.
12295 + * Arguments:  Int. Boolean. Read only pages which would have been
12296 + *             overwritten by pageset1?
12297 + * Returns:    Int. Zero if no error, otherwise the error value.
12298 + */
12299 +int read_pageset2(int overwrittenpagesonly)
12300 +{
12301 +       int result = 0;
12302 +
12303 +       if (!pagedir2.size)
12304 +               return 0;
12305 +
12306 +       result = read_pageset(&pagedir2, overwrittenpagesonly);
12307 +
12308 +       toi_update_status(100, 100, NULL);
12309 +       toi_cond_pause(1, "Pagedir 2 read.");
12310 +
12311 +       return result;
12312 +}
12313 +
12314 +/* image_exists_read
12315 + *
12316 + * Return 0 or 1, depending on whether an image is found.
12317 + * Incoming buffer is PAGE_SIZE and result is guaranteed
12318 + * to be far less than that, so we don't worry about
12319 + * overflow.
12320 + */
12321 +int image_exists_read(const char *page, int count)
12322 +{
12323 +       int len = 0;
12324 +       char *result;
12325 +
12326 +       if (toi_activate_storage(0))
12327 +               return count;
12328 +
12329 +       if (!test_toi_state(TOI_RESUME_DEVICE_OK))
12330 +               toi_attempt_to_parse_resume_device(0);
12331 +
12332 +       if (!toiActiveAllocator) {
12333 +               len = sprintf((char *) page, "-1\n");
12334 +       } else {
12335 +               result = get_have_image_data();
12336 +               if (result) {
12337 +                       len = sprintf((char *) page, "%s",  result);
12338 +                       toi_free_page(26, (unsigned long) result);
12339 +               }
12340 +       }
12341 +
12342 +       toi_deactivate_storage(0);
12343 +
12344 +       return len;
12345 +}
12346 +
12347 +/* image_exists_write
12348 + *
12349 + * Invalidate an image if one exists.
12350 + */
12351 +int image_exists_write(const char *buffer, int count)
12352 +{
12353 +       if (toi_activate_storage(0))
12354 +               return count;
12355 +
12356 +       if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
12357 +               toiActiveAllocator->remove_image();
12358 +
12359 +       toi_deactivate_storage(0);
12360 +
12361 +       clear_result_state(TOI_KEPT_IMAGE);
12362 +
12363 +       return count;
12364 +}
12365 +
12366 +#ifdef CONFIG_TOI_EXPORTS
12367 +EXPORT_SYMBOL_GPL(toi_attempt_to_parse_resume_device);
12368 +EXPORT_SYMBOL_GPL(attempt_to_parse_resume_device2);
12369 +EXPORT_SYMBOL_GPL(toi_io_workers);
12370 +EXPORT_SYMBOL_GPL(toi_io_queue_flusher);
12371 +EXPORT_SYMBOL_GPL(toi_bio_queue_flusher_should_finish);
12372 +#endif
12373 +
12374 diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
12375 new file mode 100644
12376 index 0000000..d4b470b
12377 --- /dev/null
12378 +++ b/kernel/power/tuxonice_io.h
12379 @@ -0,0 +1,71 @@
12380 +/*
12381 + * kernel/power/tuxonice_io.h
12382 + *
12383 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
12384 + *
12385 + * This file is released under the GPLv2.
12386 + *
12387 + * It contains high level IO routines for hibernating.
12388 + *
12389 + */
12390 +
12391 +#include <linux/utsname.h>
12392 +#include "tuxonice_pagedir.h"
12393 +#include "power.h"
12394 +
12395 +/* Non-module data saved in our image header */
12396 +struct toi_header {
12397 +       /*
12398 +        * Mirror struct swsusp_info, but without
12399 +        * the page aligned attribute
12400 +        */
12401 +       struct new_utsname uts;
12402 +       u32 version_code;
12403 +       unsigned long num_physpages;
12404 +       int cpus;
12405 +       unsigned long image_pages;
12406 +       unsigned long pages;
12407 +       unsigned long size;
12408 +
12409 +       /* Our own data */
12410 +       unsigned long orig_mem_free;
12411 +       int page_size;
12412 +       int pageset_2_size;
12413 +       int param0;
12414 +       int param1;
12415 +       int param2;
12416 +       int param3;
12417 +       int progress0;
12418 +       int progress1;
12419 +       int progress2;
12420 +       int progress3;
12421 +       int io_time[2][2];
12422 +       struct pagedir pagedir;
12423 +       dev_t root_fs;
12424 +       unsigned long bkd; /* Boot kernel data locn */
12425 +};
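+
+/*
+ * The fields above, down to and including 'size', must mirror struct
+ * swsusp_info in both order and type: sanity_check() in tuxonice_io.c
+ * casts a struct toi_header * to a struct swsusp_info * when checking
+ * the image against the booted kernel.
+ */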
12426 +
12427 +extern int write_pageset(struct pagedir *pagedir);
12428 +extern int write_image_header(void);
12429 +extern int read_pageset1(void);
12430 +extern int read_pageset2(int overwrittenpagesonly);
12431 +
12432 +extern int toi_attempt_to_parse_resume_device(int quiet);
12433 +extern void attempt_to_parse_resume_device2(void);
12434 +extern void attempt_to_parse_alt_resume_param(void);
12435 +int image_exists_read(const char *page, int count);
12436 +int image_exists_write(const char *buffer, int count);
12437 +extern void save_restore_alt_param(int replace, int quiet);
12438 +extern atomic_t toi_io_workers;
12439 +
12440 +/* Args to save_restore_alt_param */
12441 +#define RESTORE 0
12442 +#define SAVE 1
12443 +
12444 +#define NOQUIET 0
12445 +#define QUIET 1
12446 +
12447 +extern dev_t name_to_dev_t(char *line);
12448 +
12449 +extern wait_queue_head_t toi_io_queue_flusher;
12450 +extern int toi_bio_queue_flusher_should_finish;
12451 diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
12452 new file mode 100644
12453 index 0000000..95ce455
12454 --- /dev/null
12455 +++ b/kernel/power/tuxonice_modules.c
12456 @@ -0,0 +1,461 @@
12457 +/*
12458 + * kernel/power/tuxonice_modules.c
12459 + *
12460 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
12461 + *
12462 + */
12463 +
12464 +#include <linux/suspend.h>
12465 +#include <linux/module.h>
12466 +#include "tuxonice.h"
12467 +#include "tuxonice_modules.h"
12468 +#include "tuxonice_sysfs.h"
12469 +#include "tuxonice_ui.h"
12470 +
12471 +LIST_HEAD(toi_filters);
12472 +LIST_HEAD(toiAllocators);
12473 +LIST_HEAD(toi_modules);
12474 +
12475 +struct toi_module_ops *toiActiveAllocator;
12476 +int toi_num_filters;
12477 +int toiNumAllocators, toi_num_modules;
12478 +
12479 +/*
12480 + * toi_header_storage_for_modules
12481 + *
12482 + * Returns the amount of space needed to store the configuration
12483 + * data the modules require prior to copying back the original
12484 + * kernel. We can exclude data for pageset2 because it will be
12485 + * available anyway once the kernel is copied back.
12486 + */
12487 +long toi_header_storage_for_modules(void)
12488 +{
12489 +       struct toi_module_ops *this_module;
12490 +       int bytes = 0;
12491 +
12492 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12493 +               if (!this_module->enabled ||
12494 +                   (this_module->type == WRITER_MODULE &&
12495 +                    toiActiveAllocator != this_module))
12496 +                       continue;
12497 +               if (this_module->storage_needed) {
12498 +                       int this = this_module->storage_needed() +
12499 +                               sizeof(struct toi_module_header) +
12500 +                               sizeof(int);
12501 +                       this_module->header_requested = this;
12502 +                       bytes += this;
12503 +               }
12504 +       }
12505 +
12506 +       /* One more for the empty terminator */
12507 +       return bytes + sizeof(struct toi_module_header);
12508 +}
12509 +
12510 +/*
12511 + * toi_memory_for_modules
12512 + *
12513 + * Returns the number of pages of memory requested by modules
12514 + * for doing their work during the cycle.
12515 + */
12516 +
12517 +long toi_memory_for_modules(int print_parts)
12518 +{
12519 +       long bytes = 0, result;
12520 +       struct toi_module_ops *this_module;
12521 +
12522 +       if (print_parts)
12523 +               printk(KERN_INFO "Memory for modules:\n===================\n");
12524 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12525 +               int this;
12526 +               if (!this_module->enabled)
12527 +                       continue;
12528 +               if (this_module->memory_needed) {
12529 +                       this = this_module->memory_needed();
12530 +                       if (print_parts)
12531 +                               printk(KERN_INFO "%10d bytes (%5ld pages) for "
12532 +                                               "module '%s'.\n", this,
12533 +                                               DIV_ROUND_UP(this, PAGE_SIZE),
12534 +                                               this_module->name);
12535 +                       bytes += this;
12536 +               }
12537 +       }
12538 +
12539 +       result = DIV_ROUND_UP(bytes, PAGE_SIZE);
12540 +       if (print_parts)
12541 +               printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
12542 +
12543 +       return result;
12544 +}
12545 +
12546 +/*
12547 + * toi_expected_compression_ratio
12548 + *
12549 + * Returns the compression ratio expected when saving the image.
12550 + */
12551 +
12552 +int toi_expected_compression_ratio(void)
12553 +{
12554 +       int ratio = 100;
12555 +       struct toi_module_ops *this_module;
12556 +
12557 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12558 +               if (!this_module->enabled)
12559 +                       continue;
12560 +               if (this_module->expected_compression)
12561 +                       ratio = ratio * this_module->expected_compression()
12562 +                               / 100;
12563 +       }
12564 +
12565 +       return ratio;
12566 +}
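+
+/*
+ * Ratios compose multiplicatively. For example, a filter expecting 50%
+ * followed by one expecting 80% yields 100 * 50/100 * 80/100 = 40,
+ * i.e. the image is expected to end up at 40% of its raw size.
+ */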
12567 +
12568 +/* toi_find_module_given_dir
12569 + * Functionality :     Return a module (if found), given a pointer
12570 + *                     to its directory name
12571 + */
12572 +
12573 +static struct toi_module_ops *toi_find_module_given_dir(char *name)
12574 +{
12575 +       struct toi_module_ops *this_module, *found_module = NULL;
12576 +
12577 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12578 +               if (!strcmp(name, this_module->directory)) {
12579 +                       found_module = this_module;
12580 +                       break;
12581 +               }
12582 +       }
12583 +
12584 +       return found_module;
12585 +}
12586 +
12587 +/* toi_find_module_given_name
12588 + * Functionality :     Return a module (if found), given a pointer
12589 + *                     to its name
12590 + */
12591 +
12592 +struct toi_module_ops *toi_find_module_given_name(char *name)
12593 +{
12594 +       struct toi_module_ops *this_module, *found_module = NULL;
12595 +
12596 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12597 +               if (!strcmp(name, this_module->name)) {
12598 +                       found_module = this_module;
12599 +                       break;
12600 +               }
12601 +       }
12602 +
12603 +       return found_module;
12604 +}
12605 +
12606 +/*
12607 + * toi_print_module_debug_info
12608 + * Functionality   : Get debugging info from modules into a buffer.
12609 + */
12610 +int toi_print_module_debug_info(char *buffer, int buffer_size)
12611 +{
12612 +       struct toi_module_ops *this_module;
12613 +       int len = 0;
12614 +
12615 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12616 +               if (!this_module->enabled)
12617 +                       continue;
12618 +               if (this_module->print_debug_info) {
12619 +                       int result;
12620 +                       result = this_module->print_debug_info(buffer + len,
12621 +                                       buffer_size - len);
12622 +                       len += result;
12623 +               }
12624 +       }
12625 +
12626 +       /* Ensure null terminated */
12627 +       buffer[buffer_size - 1] = '\0';
12628 +
12629 +       return len;
12630 +}
12631 +
12632 +/*
12633 + * toi_register_module
12634 + *
12635 + * Register a module.
12636 + */
12637 +int toi_register_module(struct toi_module_ops *module)
12638 +{
12639 +       int i;
12640 +       struct kobject *kobj;
12641 +
12642 +       module->enabled = 1;
12643 +
12644 +       if (toi_find_module_given_name(module->name)) {
12645 +               printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
12646 +                               " which is already registered.\n",
12647 +                               module->name);
12648 +               return -EBUSY;
12649 +       }
12650 +
12651 +       switch (module->type) {
12652 +       case FILTER_MODULE:
12653 +               list_add_tail(&module->type_list, &toi_filters);
12654 +               toi_num_filters++;
12655 +               break;
12656 +       case WRITER_MODULE:
12657 +               list_add_tail(&module->type_list, &toiAllocators);
12658 +               toiNumAllocators++;
12659 +               break;
12660 +       case MISC_MODULE:
12661 +       case MISC_HIDDEN_MODULE:
12662 +               break;
12663 +       default:
12664 +               printk("Hmmm. Module '%s' has an invalid type."
12665 +                       " It has been ignored.\n", module->name);
12666 +               return -EINVAL;
12667 +       }
12668 +       list_add_tail(&module->module_list, &toi_modules);
12669 +       toi_num_modules++;
12670 +
12671 +       if (!module->directory && !module->shared_directory)
12672 +               return 0;
12673 +
12674 +       /*
12675 +        * Modules may share a directory, but those with shared_directory
12676 +        * set must be loaded (via symbol dependencies) after parents
12677 +        * and unloaded beforehand.
12678 +        */
12679 +       if (module->shared_directory) {
12680 +               struct toi_module_ops *shared =
12681 +                       toi_find_module_given_dir(module->shared_directory);
12682 +               if (!shared) {
12683 +                       printk("TuxOnIce: Module %s wants to share %s's "
12684 +                                       "directory but %s isn't loaded.\n",
12685 +                                       module->name, module->shared_directory,
12686 +                                       module->shared_directory);
12687 +                       toi_unregister_module(module);
12688 +                       return -ENODEV;
12689 +               }
12690 +               kobj = shared->dir_kobj;
12691 +       } else {
12692 +               if (!strncmp(module->directory, "[ROOT]", 6))
12693 +                       kobj = tuxonice_kobj;
12694 +               else
12695 +                       kobj = make_toi_sysdir(module->directory);
12696 +       }
12697 +       module->dir_kobj = kobj;
12698 +       for (i = 0; i < module->num_sysfs_entries; i++) {
12699 +               int result = toi_register_sysfs_file(kobj,
12700 +                               &module->sysfs_data[i]);
12701 +               if (result)
12702 +                       return result;
12703 +       }
12704 +       return 0;
12705 +}
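+
+/*
+ * For reference, a minimal hypothetical client (all names other than
+ * the struct toi_module_ops fields are invented for illustration):
+ *
+ *   static struct toi_module_ops example_ops = {
+ *           .type      = MISC_MODULE,
+ *           .name      = "example",
+ *           .directory = "example",
+ *           .module    = THIS_MODULE,
+ *   };
+ *
+ *   static int __init example_init(void)
+ *   {
+ *           return toi_register_module(&example_ops);
+ *   }
+ */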
12706 +
12707 +/*
12708 + * toi_unregister_module
12709 + *
12710 + * Remove a module.
12711 + */
12712 +void toi_unregister_module(struct toi_module_ops *module)
12713 +{
12714 +       int i;
12715 +
12716 +       if (module->dir_kobj)
12717 +               for (i = 0; i < module->num_sysfs_entries; i++)
12718 +                       toi_unregister_sysfs_file(module->dir_kobj,
12719 +                                       &module->sysfs_data[i]);
12720 +
12721 +       if (!module->shared_directory && module->directory &&
12722 +                       strncmp(module->directory, "[ROOT]", 6))
12723 +               remove_toi_sysdir(module->dir_kobj);
12724 +
12725 +       switch (module->type) {
12726 +       case FILTER_MODULE:
12727 +               list_del(&module->type_list);
12728 +               toi_num_filters--;
12729 +               break;
12730 +       case WRITER_MODULE:
12731 +               list_del(&module->type_list);
12732 +               toiNumAllocators--;
12733 +               if (toiActiveAllocator == module) {
12734 +                       toiActiveAllocator = NULL;
12735 +                       clear_toi_state(TOI_CAN_RESUME);
12736 +                       clear_toi_state(TOI_CAN_HIBERNATE);
12737 +               }
12738 +               break;
12739 +       case MISC_MODULE:
12740 +       case MISC_HIDDEN_MODULE:
12741 +               break;
12742 +       default:
12743 +               printk("Hmmm. Module '%s' has an invalid type."
12744 +                       " It has been ignored.\n", module->name);
12745 +               return;
12746 +       }
12747 +       list_del(&module->module_list);
12748 +       toi_num_modules--;
12749 +}
12750 +
12751 +/*
12752 + * toi_move_module_tail
12753 + *
12754 + * Rearrange modules when reloading the config.
12755 + */
12756 +void toi_move_module_tail(struct toi_module_ops *module)
12757 +{
12758 +       switch (module->type) {
12759 +       case FILTER_MODULE:
12760 +               if (toi_num_filters > 1)
12761 +                       list_move_tail(&module->type_list, &toi_filters);
12762 +               break;
12763 +       case WRITER_MODULE:
12764 +               if (toiNumAllocators > 1)
12765 +                       list_move_tail(&module->type_list, &toiAllocators);
12766 +               break;
12767 +       case MISC_MODULE:
12768 +       case MISC_HIDDEN_MODULE:
12769 +               break;
12770 +       default:
12771 +               printk("Hmmm. Module '%s' has an invalid type."
12772 +                       " It has been ignored.\n", module->name);
12773 +               return;
12774 +       }
12775 +       if ((toi_num_filters + toiNumAllocators) > 1)
12776 +               list_move_tail(&module->module_list, &toi_modules);
12777 +}
12778 +
12779 +/*
12780 + * toi_initialise_modules
12781 + *
12782 + * Get ready to do some work!
12783 + */
12784 +int toi_initialise_modules(int starting_cycle, int early)
12785 +{
12786 +       struct toi_module_ops *this_module;
12787 +       int result;
12788 +
12789 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12790 +               this_module->header_requested = 0;
12791 +               this_module->header_used = 0;
12792 +               if (!this_module->enabled)
12793 +                       continue;
12794 +               if (this_module->early != early)
12795 +                       continue;
12796 +               if (this_module->initialise) {
12797 +                       toi_message(TOI_MEMORY, TOI_MEDIUM, 1,
12798 +                               "Initialising module %s.\n",
12799 +                               this_module->name);
12800 +                       result = this_module->initialise(starting_cycle);
12801 +                       if (result)
12802 +                               return result;
12803 +               }
12804 +       }
12805 +
12806 +       return 0;
12807 +}
12808 +
12809 +/*
12810 + * toi_cleanup_modules
12811 + *
12812 + * Tell modules the work is done.
12813 + */
12814 +void toi_cleanup_modules(int finishing_cycle)
12815 +{
12816 +       struct toi_module_ops *this_module;
12817 +
12818 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12819 +               if (!this_module->enabled)
12820 +                       continue;
12821 +               if (this_module->cleanup) {
12822 +                       toi_message(TOI_MEMORY, TOI_MEDIUM, 1,
12823 +                               "Cleaning up module %s.\n",
12824 +                               this_module->name);
12825 +                       this_module->cleanup(finishing_cycle);
12826 +               }
12827 +       }
12828 +}
12829 +
12830 +/*
12831 + * toi_get_next_filter
12832 + *
12833 + * Get the next filter in the pipeline.
12834 + */
12835 +struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
12836 +{
12837 +       struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
12838 +
12839 +       list_for_each_entry(this_filter, &toi_filters, type_list) {
12840 +               if (!this_filter->enabled)
12841 +                       continue;
12842 +               if ((last_filter == filter_sought) || (!filter_sought))
12843 +                       return this_filter;
12844 +               last_filter = this_filter;
12845 +       }
12846 +
12847 +       return toiActiveAllocator;
12848 +}
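+
+/*
+ * A sketch of how a caller walks the pipeline: passing NULL yields the
+ * first enabled filter, and once the filters are exhausted, the active
+ * allocator terminates the chain.
+ *
+ *   struct toi_module_ops *this = toi_get_next_filter(NULL);
+ *
+ *   while (this && this != toiActiveAllocator) {
+ *           (apply this filter)
+ *           this = toi_get_next_filter(this);
+ *   }
+ */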
12849 +
12850 +/**
12851 + * toi_print_modules: Printk what support is loaded.
12852 + */
12853 +void toi_print_modules(void)
12854 +{
12855 +       struct toi_module_ops *this_module;
12856 +       int prev = 0;
12857 +
12858 +       printk("TuxOnIce " TOI_CORE_VERSION ", with support for");
12859 +
12860 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12861 +               if (this_module->type == MISC_HIDDEN_MODULE)
12862 +                       continue;
12863 +               printk("%s %s%s%s", prev ? "," : "",
12864 +                               this_module->enabled ? "" : "[",
12865 +                               this_module->name,
12866 +                               this_module->enabled ? "" : "]");
12867 +               prev = 1;
12868 +       }
12869 +
12870 +       printk(".\n");
12871 +}
12872 +
12873 +/* toi_get_modules
12874 + *
12875 + * Take a reference to modules so they can't go away under us.
12876 + */
12877 +
12878 +int toi_get_modules(void)
12879 +{
12880 +       struct toi_module_ops *this_module;
12881 +
12882 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12883 +               struct toi_module_ops *this_module2;
12884 +
12885 +               if (try_module_get(this_module->module))
12886 +                       continue;
12887 +
12888 +               /* Failed! Reverse gets and return error */
12889 +               list_for_each_entry(this_module2, &toi_modules,
12890 +                               module_list) {
12891 +                       if (this_module == this_module2)
12892 +                               return -EINVAL;
12893 +                       module_put(this_module2->module);
12894 +               }
12895 +       }
12896 +       return 0;
12897 +}
12898 +
12899 +/* toi_put_modules
12900 + *
12901 + * Release our references to modules we used.
12902 + */
12903 +
12904 +void toi_put_modules(void)
12905 +{
12906 +       struct toi_module_ops *this_module;
12907 +
12908 +       list_for_each_entry(this_module, &toi_modules, module_list)
12909 +               module_put(this_module->module);
12910 +}
12911 +
12912 +#ifdef CONFIG_TOI_EXPORTS
12913 +EXPORT_SYMBOL_GPL(toi_register_module);
12914 +EXPORT_SYMBOL_GPL(toi_unregister_module);
12915 +EXPORT_SYMBOL_GPL(toi_get_next_filter);
12916 +EXPORT_SYMBOL_GPL(toiActiveAllocator);
12917 +#endif
12918 diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
12919 new file mode 100644
12920 index 0000000..8397ddf
12921 --- /dev/null
12922 +++ b/kernel/power/tuxonice_modules.h
12923 @@ -0,0 +1,176 @@
12924 +/*
12925 + * kernel/power/tuxonice_modules.h
12926 + *
12927 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
12928 + *
12929 + * This file is released under the GPLv2.
12930 + *
12931 + * It contains declarations for modules. Modules are additions to
12932 + * TuxOnIce that provide facilities such as image compression or
12933 + * encryption, backends for storage of the image and user interfaces.
12934 + *
12935 + */
12936 +
12937 +#ifndef TOI_MODULES_H
12938 +#define TOI_MODULES_H
12939 +
12940 +/* This is the maximum size we store in the image header for a module name */
12941 +#define TOI_MAX_MODULE_NAME_LENGTH 30
12942 +
12943 +/* Per-module metadata */
12944 +struct toi_module_header {
12945 +       char name[TOI_MAX_MODULE_NAME_LENGTH];
12946 +       int enabled;
12947 +       int type;
12948 +       int index;
12949 +       int data_length;
12950 +       unsigned long signature;
12951 +};
12952 +
12953 +enum {
12954 +       FILTER_MODULE,
12955 +       WRITER_MODULE,
12956 +       MISC_MODULE, /* e.g. the block writer */
12957 +       MISC_HIDDEN_MODULE,
12958 +};
12959 +
12960 +enum {
12961 +       TOI_ASYNC,
12962 +       TOI_SYNC
12963 +};
12964 +
12965 +struct toi_module_ops {
12966 +       /* Functions common to all modules */
12967 +       int type;
12968 +       char *name;
12969 +       char *directory;
12970 +       char *shared_directory;
12971 +       struct kobject *dir_kobj;
12972 +       struct module *module;
12973 +       int enabled, early;
12974 +       struct list_head module_list;
12975 +
12976 +       /* List of filters or allocators */
12977 +       struct list_head list, type_list;
12978 +
12979 +       /*
12980 +        * Requirements for memory and for storage in
12981 +        * the image header.
12982 +        */
12983 +       int (*memory_needed) (void);
12984 +       int (*storage_needed) (void);
12985 +
12986 +       int header_requested, header_used;
12987 +
12988 +       int (*expected_compression) (void);
12989 +
12990 +       /*
12991 +        * Debug info
12992 +        */
12993 +       int (*print_debug_info) (char *buffer, int size);
12994 +       int (*save_config_info) (char *buffer);
12995 +       void (*load_config_info) (char *buffer, int len);
12996 +
12997 +       /*
12998 +        * Initialise & cleanup - general routines called
12999 +        * at the start and end of a cycle.
13000 +        */
13001 +       int (*initialise) (int starting_cycle);
13002 +       void (*cleanup) (int finishing_cycle);
13003 +
13004 +       /*
13005 +        * Calls for allocating storage (allocators only).
13006 +        *
13007 +        * Header space is allocated separately. Note that allocation
13008 +        * of space for the header might result in allocated space
13009 +        * being stolen from the main pool if there is no unallocated
13010 +        * space. We have to be able to allocate enough space for
13011 +        * the header. We can eat memory to ensure there is enough
13012 +        * for the main pool.
13013 +        */
13014 +
13015 +       int (*storage_available) (void);
13016 +       void (*reserve_header_space) (int space_requested);
13017 +       int (*allocate_storage) (int space_requested);
13018 +       int (*storage_allocated) (void);
13019 +       int (*release_storage) (void);
13020 +
13021 +       /*
13022 +        * Routines used in image I/O.
13023 +        */
13024 +       int (*rw_init) (int rw, int stream_number);
13025 +       int (*rw_cleanup) (int rw);
13026 +       int (*write_page) (unsigned long index, struct page *buffer_page,
13027 +                       unsigned int buf_size);
13028 +       int (*read_page) (unsigned long *index, struct page *buffer_page,
13029 +                       unsigned int *buf_size);
13030 +       void (*io_flusher) (int rw);
13031 +
13032 +       /* Reset module if image exists but reading aborted */
13033 +       void (*noresume_reset) (void);
13034 +
13035 +       /* Read and write the metadata */
13036 +       int (*write_header_init) (void);
13037 +       int (*write_header_cleanup) (void);
13038 +
13039 +       int (*read_header_init) (void);
13040 +       int (*read_header_cleanup) (void);
13041 +
13042 +       int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
13043 +                       char *buffer_start, int buffer_size);
13044 +
13045 +       int (*rw_header_chunk_noreadahead) (int rw,
13046 +                       struct toi_module_ops *owner, char *buffer_start,
13047 +                       int buffer_size);
13048 +
13049 +       /* Attempt to parse an image location */
13050 +       int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
13051 +
13052 +       /* Determine whether image exists that we can restore */
13053 +       int (*image_exists) (int quiet);
13054 +
13055 +       /* Mark the image as having tried to resume */
13056 +       int (*mark_resume_attempted) (int);
13057 +
13058 +       /* Destroy image if one exists */
13059 +       int (*remove_image) (void);
13060 +
13061 +       /* Sysfs Data */
13062 +       struct toi_sysfs_data *sysfs_data;
13063 +       int num_sysfs_entries;
13064 +};
13065 +
13066 +extern int toi_num_modules, toiNumAllocators;
13067 +
13068 +extern struct toi_module_ops *toiActiveAllocator;
13069 +extern struct list_head toi_filters, toiAllocators, toi_modules;
13070 +
13071 +extern void toi_prepare_console_modules(void);
13072 +extern void toi_cleanup_console_modules(void);
13073 +
13074 +extern struct toi_module_ops *toi_find_module_given_name(char *name);
13075 +extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
13076 +
13077 +extern int toi_register_module(struct toi_module_ops *module);
13078 +extern void toi_move_module_tail(struct toi_module_ops *module);
13079 +
13080 +extern long toi_header_storage_for_modules(void);
13081 +extern long toi_memory_for_modules(int print_parts);
13082 +extern int toi_expected_compression_ratio(void);
13083 +
13084 +extern int toi_print_module_debug_info(char *buffer, int buffer_size);
13086 +extern void toi_unregister_module(struct toi_module_ops *module);
13087 +
13088 +extern int toi_initialise_modules(int starting_cycle, int early);
13089 +#define toi_initialise_modules_early(starting) \
13090 +       toi_initialise_modules(starting, 1)
13091 +#define toi_initialise_modules_late(starting) \
13092 +       toi_initialise_modules(starting, 0)
13093 +extern void toi_cleanup_modules(int finishing_cycle);
13094 +
13095 +extern void toi_print_modules(void);
13096 +
13097 +int toi_get_modules(void);
13098 +void toi_put_modules(void);
13099 +#endif
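
As a rough illustration of the interface above, a minimal module might register itself as follows. This is a sketch only: the example_ops instance and its trivial callbacks are hypothetical, and the field choices mirror the powerdown_ops module defined later in this patch.

/* Hypothetical minimal TuxOnIce module using the declarations above. */
static int example_initialise(int starting_cycle)
{
	return 0;	/* nothing to set up */
}

static void example_cleanup(int finishing_cycle)
{
}

static struct toi_module_ops example_ops = {
	.type		= MISC_HIDDEN_MODULE,
	.name		= "example",
	.directory	= "example",
	.module		= THIS_MODULE,
	.initialise	= example_initialise,
	.cleanup	= example_cleanup,
};

/* Registration would typically happen from an __init routine: */
static int __init example_init(void)
{
	return toi_register_module(&example_ops);
}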
13100 diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
13101 new file mode 100644
13102 index 0000000..f459a7f
13103 --- /dev/null
13104 +++ b/kernel/power/tuxonice_netlink.c
13105 @@ -0,0 +1,323 @@
13106 +/*
13107 + * kernel/power/tuxonice_netlink.c
13108 + *
13109 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
13110 + *
13111 + * This file is released under the GPLv2.
13112 + *
13113 + * Functions for communicating with a userspace helper via netlink.
13114 + */
13115 +
13117 +#include <linux/suspend.h>
13118 +#include "tuxonice_netlink.h"
13119 +#include "tuxonice.h"
13120 +#include "tuxonice_modules.h"
13121 +#include "tuxonice_alloc.h"
13122 +
13123 +struct user_helper_data *uhd_list;
13124 +
13125 +/*
13126 + * Refill our pool of SKBs for use in emergencies (eg, when eating memory and
13127 + * none can be allocated).
13128 + */
13129 +static void toi_fill_skb_pool(struct user_helper_data *uhd)
13130 +{
13131 +       while (uhd->pool_level < uhd->pool_limit) {
13132 +               struct sk_buff *new_skb =
13133 +                       alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
13134 +
13135 +               if (!new_skb)
13136 +                       break;
13137 +
13138 +               new_skb->next = uhd->emerg_skbs;
13139 +               uhd->emerg_skbs = new_skb;
13140 +               uhd->pool_level++;
13141 +       }
13142 +}
13143 +
13144 +/*
13145 + * Try to allocate a single skb. If we can't get one, try to use one from
13146 + * our pool.
13147 + */
13148 +static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
13149 +{
13150 +       struct sk_buff *skb =
13151 +               alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
13152 +
13153 +       if (skb)
13154 +               return skb;
13155 +
13156 +       skb = uhd->emerg_skbs;
13157 +       if (skb) {
13158 +               uhd->pool_level--;
13159 +               uhd->emerg_skbs = skb->next;
13160 +               skb->next = NULL;
13161 +       }
13162 +
13163 +       return skb;
13164 +}
13165 +
13166 +static void put_skb(struct user_helper_data *uhd, struct sk_buff *skb)
13167 +{
13168 +       if (uhd->pool_level < uhd->pool_limit) {
13169 +               skb->next = uhd->emerg_skbs;
13170 +               uhd->emerg_skbs = skb;
13171 +       } else
13172 +               kfree_skb(skb);
13173 +}
13174 +
13175 +void toi_send_netlink_message(struct user_helper_data *uhd,
13176 +               int type, void *params, size_t len)
13177 +{
13178 +       struct sk_buff *skb;
13179 +       struct nlmsghdr *nlh;
13180 +       void *dest;
13181 +       struct task_struct *t;
13182 +
13183 +       if (uhd->pid == -1)
13184 +               return;
13185 +
13186 +       skb = toi_get_skb(uhd);
13187 +       if (!skb) {
13188 +               printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
13189 +               return;
13190 +       }
13191 +
13192 +       /* NLMSG_PUT contains a hidden goto nlmsg_failure */
13193 +       nlh = NLMSG_PUT(skb, 0, uhd->sock_seq, type, len);
13194 +       uhd->sock_seq++;
13195 +
13196 +       dest = NLMSG_DATA(nlh);
13197 +       if (params && len > 0)
13198 +               memcpy(dest, params, len);
13199 +
13200 +       netlink_unicast(uhd->nl, skb, uhd->pid, 0);
13201 +
13202 +       read_lock(&tasklist_lock);
13203 +       t = find_task_by_pid(uhd->pid);
13204 +       if (!t) {
13205 +               read_unlock(&tasklist_lock);
13206 +               if (uhd->pid > -1)
13207 +                       printk(KERN_INFO "Hmm. Can't find the userspace task"
13208 +                               " %d.\n", uhd->pid);
13209 +               return;
13210 +       }
13211 +       wake_up_process(t);
13212 +       read_unlock(&tasklist_lock);
13213 +
13214 +       yield();
13215 +
13216 +       return;
13217 +
13218 +nlmsg_failure:
13219 +       if (skb)
13220 +               put_skb(uhd, skb);
13221 +}
13222 +EXPORT_SYMBOL_GPL(toi_send_netlink_message);
13223 +
13224 +static void send_whether_debugging(struct user_helper_data *uhd)
13225 +{
13226 +       static int is_debugging = 1;
13227 +
13228 +       toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
13229 +                       &is_debugging, sizeof(int));
13230 +}
13231 +
13232 +/*
13233 + * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
13234 + * are hibernating.
13235 + */
13236 +static int nl_set_nofreeze(struct user_helper_data *uhd, int pid)
13237 +{
13238 +       struct task_struct *t;
13239 +
13240 +       read_lock(&tasklist_lock);
13241 +       t = find_task_by_pid(pid);
13242 +       if (!t) {
13243 +               read_unlock(&tasklist_lock);
13244 +               printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
13245 +                               pid);
13246 +               return -EINVAL;
13247 +       }
13248 +
13249 +       t->flags |= PF_NOFREEZE;
13250 +
13251 +       read_unlock(&tasklist_lock);
13252 +       uhd->pid = pid;
13253 +
13254 +       toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
13255 +
13256 +       return 0;
13257 +}
13258 +
13259 +/*
13260 + * Called when the userspace process has informed us that it's ready to roll.
13261 + */
13262 +static int nl_ready(struct user_helper_data *uhd, int version)
13263 +{
13264 +       if (version != uhd->interface_version) {
13265 +               printk(KERN_INFO "%s userspace process using invalid interface"
13266 +                               " version. Trying to continue without it.\n",
13267 +                               uhd->name);
13268 +               if (uhd->not_ready)
13269 +                       uhd->not_ready();
13270 +               return -EINVAL;
13271 +       }
13272 +
13273 +       complete(&uhd->wait_for_process);
13274 +
13275 +       return 0;
13276 +}
13277 +
13278 +void toi_netlink_close_complete(struct user_helper_data *uhd)
13279 +{
13280 +       if (uhd->nl) {
13281 +               sock_release(uhd->nl->sk_socket);
13282 +               uhd->nl = NULL;
13283 +       }
13284 +
13285 +       while (uhd->emerg_skbs) {
13286 +               struct sk_buff *next = uhd->emerg_skbs->next;
13287 +               kfree_skb(uhd->emerg_skbs);
13288 +               uhd->emerg_skbs = next;
13289 +       }
13290 +
13291 +       uhd->pid = -1;
13292 +}
13293 +
13294 +static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
13295 +               struct sk_buff *skb, struct nlmsghdr *nlh)
13296 +{
13297 +       int type;
13298 +       int *data;
13299 +       int err;
13300 +
13301 +       /* Let the more specific handler go first. It returns
13302 +        * 1 for valid messages that it doesn't handle. */
13303 +       err = uhd->rcv_msg(skb, nlh);
13304 +       if (err != 1)
13305 +               return err;
13306 +
13307 +       type = nlh->nlmsg_type;
13308 +
13309 +       /* Only allow one task to receive NOFREEZE privileges */
13310 +       if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
13311 +               printk("Received extra nofreeze me requests.\n");
13312 +               return -EBUSY;
13313 +       }
13314 +
13315 +       data = (int *)NLMSG_DATA(nlh);
13316 +
13317 +       switch (type) {
13318 +       case NETLINK_MSG_NOFREEZE_ME:
13319 +               return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
13320 +       case NETLINK_MSG_GET_DEBUGGING:
13321 +               send_whether_debugging(uhd);
13322 +               return 0;
13323 +       case NETLINK_MSG_READY:
13324 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) {
13325 +                       printk(KERN_INFO "Invalid ready mesage.\n");
13326 +                       return -EINVAL;
13327 +               }
13328 +               return nl_ready(uhd, *data);
13329 +       case NETLINK_MSG_CLEANUP:
13330 +               toi_netlink_close_complete(uhd);
13331 +               return 0;
13332 +       }
13333 +
13334 +       return -EINVAL;
13335 +}
13336 +
13337 +static void toi_user_rcv_skb(struct sk_buff *skb)
13338 +{
13339 +       int err;
13340 +       struct nlmsghdr *nlh;
13341 +       struct user_helper_data *uhd = uhd_list;
13342 +
13343 +       while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
13344 +               uhd = uhd->next;
13345 +
13346 +       if (!uhd)
13347 +               return;
13348 +
13349 +       while (skb->len >= NLMSG_SPACE(0)) {
13350 +               u32 rlen;
13351 +
13352 +               nlh = (struct nlmsghdr *) skb->data;
13353 +               if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
13354 +                       return;
13355 +
13356 +               rlen = NLMSG_ALIGN(nlh->nlmsg_len);
13357 +               if (rlen > skb->len)
13358 +                       rlen = skb->len;
13359 +
13360 +               err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
13361 +               if (err)
13362 +                       netlink_ack(skb, nlh, err);
13363 +               else if (nlh->nlmsg_flags & NLM_F_ACK)
13364 +                       netlink_ack(skb, nlh, 0);
13365 +               skb_pull(skb, rlen);
13366 +       }
13367 +}
13368 +
13369 +static int netlink_prepare(struct user_helper_data *uhd)
13370 +{
13371 +       uhd->next = uhd_list;
13372 +       uhd_list = uhd;
13373 +
13374 +       uhd->sock_seq = 0x42c0ffee;
13375 +       uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, 0,
13376 +                       toi_user_rcv_skb, NULL, THIS_MODULE);
13377 +       if (!uhd->nl) {
13378 +               printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
13379 +                               uhd->name);
13380 +               return -ENOMEM;
13381 +       }
13382 +
13383 +       toi_fill_skb_pool(uhd);
13384 +
13385 +       return 0;
13386 +}
13387 +
13388 +void toi_netlink_close(struct user_helper_data *uhd)
13389 +{
13390 +       struct task_struct *t;
13391 +
13392 +       read_lock(&tasklist_lock);
13393 +       t = find_task_by_pid(uhd->pid);
13394 +       if (t)
13395 +               t->flags &= ~PF_NOFREEZE;
13396 +       read_unlock(&tasklist_lock);
13397 +
13398 +       toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
13399 +}
13400 +EXPORT_SYMBOL_GPL(toi_netlink_close);
13401 +
13402 +int toi_netlink_setup(struct user_helper_data *uhd)
13403 +{
13404 +       if (netlink_prepare(uhd) < 0) {
13405 +               printk(KERN_INFO "Netlink prepare failed.\n");
13406 +               return 1;
13407 +       }
13408 +
13409 +       if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
13410 +                               UMH_WAIT_EXEC) < 0) {
13411 +               printk(KERN_INFO "Launch userspace program failed.\n");
13412 +               toi_netlink_close_complete(uhd);
13413 +               return 1;
13414 +       }
13415 +
13416 +       /* Wait 2 seconds for the userspace process to make contact */
13417 +       wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
13418 +
13419 +       if (uhd->pid == -1) {
13420 +               printk(KERN_INFO "%s: Failed to contact userspace process.\n",
13421 +                               uhd->name);
13422 +               toi_netlink_close_complete(uhd);
13423 +               return 1;
13424 +       }
13425 +
13426 +       return 0;
13427 +}
13428 +EXPORT_SYMBOL_GPL(toi_netlink_setup);
13429 diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
13430 new file mode 100644
13431 index 0000000..721c222
13432 --- /dev/null
13433 +++ b/kernel/power/tuxonice_netlink.h
13434 @@ -0,0 +1,58 @@
13435 +/*
13436 + * kernel/power/tuxonice_netlink.h
13437 + *
13438 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
13439 + *
13440 + * This file is released under the GPLv2.
13441 + *
13442 + * Declarations for functions for communicating with a userspace helper
13443 + * via netlink.
13444 + */
13445 +
13446 +#include <linux/netlink.h>
13447 +#include <net/sock.h>
13448 +
13449 +#define NETLINK_MSG_BASE 0x10
13450 +
13451 +#define NETLINK_MSG_READY 0x10
13452 +#define NETLINK_MSG_NOFREEZE_ME 0x16
13453 +#define NETLINK_MSG_GET_DEBUGGING 0x19
13454 +#define NETLINK_MSG_CLEANUP 0x24
13455 +#define NETLINK_MSG_NOFREEZE_ACK 0x27
13456 +#define NETLINK_MSG_IS_DEBUGGING 0x28
13457 +
13458 +struct user_helper_data {
13459 +       int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
13460 +       void (*not_ready) (void);
13461 +       struct sock *nl;
13462 +       u32 sock_seq;
13463 +       pid_t pid;
13464 +       char *comm;
13465 +       char program[256];
13466 +       int pool_level;
13467 +       int pool_limit;
13468 +       struct sk_buff *emerg_skbs;
13469 +       int skb_size;
13470 +       int netlink_id;
13471 +       char *name;
13472 +       struct user_helper_data *next;
13473 +       struct completion wait_for_process;
13474 +       int interface_version;
13475 +       int must_init;
13476 +};
13477 +
13478 +#ifdef CONFIG_NET
13479 +int toi_netlink_setup(struct user_helper_data *uhd);
13480 +void toi_netlink_close(struct user_helper_data *uhd);
13481 +void toi_send_netlink_message(struct user_helper_data *uhd,
13482 +               int type, void *params, size_t len);
13483 +#else
13484 +static inline int toi_netlink_setup(struct user_helper_data *uhd)
13485 +{
13486 +       return 0;
13487 +}
13488 +
13489 +static inline void toi_netlink_close(struct user_helper_data *uhd) { }
13490 +static inline void toi_send_netlink_message(struct user_helper_data *uhd,
13491 +               int type, void *params, size_t len) { }
13492 +#endif
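
A sketch of how a client of this interface might be wired up, assuming only the declarations above. The example_rcv_msg handler, the netlink_id value and the helper path are hypothetical, and a real user would also initialise wait_for_process (e.g. with init_completion()) before calling toi_netlink_setup().

/* Hypothetical user of the netlink helper interface above. */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* Returning 1 means "valid, but not handled here", so the
	 * generic handler then processes the common message types. */
	return 1;
}

static struct user_helper_data example_uhd = {
	.rcv_msg		= example_rcv_msg,
	.name			= "example",
	.program		= "/usr/sbin/example-helper",	/* hypothetical path */
	.netlink_id		= 30,				/* assumed free protocol id */
	.pool_limit		= 6,
	.skb_size		= 4096,
	.interface_version	= 1,
	.pid			= -1,
};

toi_netlink_setup(&example_uhd) would then launch the helper and wait up to two seconds for its NETLINK_MSG_READY, with toi_netlink_close() undoing the setup at the end of the cycle.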
13493 diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
13494 new file mode 100644
13495 index 0000000..93b65cd
13496 --- /dev/null
13497 +++ b/kernel/power/tuxonice_pagedir.c
13498 @@ -0,0 +1,347 @@
13499 +/*
13500 + * kernel/power/tuxonice_pagedir.c
13501 + *
13502 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
13503 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
13504 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
13505 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
13506 + *
13507 + * This file is released under the GPLv2.
13508 + *
13509 + * Routines for handling pagesets.
13510 + * Note that pbes aren't actually stored as such. They're stored as
13511 + * bitmaps and extents.
13512 + */
13513 +
13514 +#include <linux/suspend.h>
13515 +#include <linux/highmem.h>
13516 +#include <linux/bootmem.h>
13517 +#include <linux/hardirq.h>
13518 +#include <linux/sched.h>
13519 +#include <asm/tlbflush.h>
13520 +
13521 +#include "tuxonice_pageflags.h"
13522 +#include "tuxonice_ui.h"
13523 +#include "tuxonice_pagedir.h"
13524 +#include "tuxonice_prepare_image.h"
13525 +#include "tuxonice.h"
13526 +#include "power.h"
13527 +#include "tuxonice_builtin.h"
13528 +#include "tuxonice_alloc.h"
13529 +
13530 +static int ptoi_pfn;
13531 +static struct pbe *this_low_pbe;
13532 +static struct pbe **last_low_pbe_ptr;
13533 +
13534 +void toi_reset_alt_image_pageset2_pfn(void)
13535 +{
13536 +       ptoi_pfn = max_pfn + 1;
13537 +}
13538 +
13539 +static struct page *first_conflicting_page;
13540 +
13541 +/*
13542 + * free_conflicting_pages
13543 + */
13544 +
13545 +void free_conflicting_pages(void)
13546 +{
13547 +       while (first_conflicting_page) {
13548 +               struct page *next =
13549 +                       *((struct page **) kmap(first_conflicting_page));
13550 +               kunmap(first_conflicting_page);
13551 +               toi__free_page(29, first_conflicting_page);
13552 +               first_conflicting_page = next;
13553 +       }
13554 +}
13555 +
13556 +/* ___toi_get_nonconflicting_page
13557 + *
13558 + * Description: Gets order zero pages that won't be overwritten
13559 + *             while copying the original pages.
13560 + */
13561 +
13562 +struct page *___toi_get_nonconflicting_page(int can_be_highmem)
13563 +{
13564 +       struct page *page;
13565 +       int flags = TOI_ATOMIC_GFP;
13566 +       if (can_be_highmem)
13567 +               flags |= __GFP_HIGHMEM;
13568 +
13570 +       if (test_toi_state(TOI_LOADING_ALT_IMAGE) && pageset2_map.bitmap &&
13571 +                               (ptoi_pfn < (max_pfn + 2))) {
13572 +               /*
13573 +                * ptoi_pfn = max_pfn + 1 when yet to find first ps2 pfn that
13574 +                * can be used.
13575 +                *         = 0..max_pfn when going through list.
13576 +                *         = max_pfn + 2 when gone through whole list.
13577 +                */
13578 +               do {
13579 +                       ptoi_pfn = get_next_bit_on(&pageset2_map, ptoi_pfn);
13580 +                       if (ptoi_pfn <= max_pfn) {
13581 +                               page = pfn_to_page(ptoi_pfn);
13582 +                               if (!PagePageset1(page) &&
13583 +                                   (can_be_highmem || !PageHighMem(page)))
13584 +                                       return page;
13585 +                       } else
13586 +                               ptoi_pfn++;
13587 +               } while (ptoi_pfn < max_pfn);
13588 +       }
13589 +
13590 +       do {
13591 +               page = toi_alloc_page(29, flags);
13592 +               if (!page) {
13593 +                       printk(KERN_INFO "Failed to get nonconflicting "
13594 +                                       "page.\n");
13595 +                       return NULL;
13596 +               }
13597 +               if (PagePageset1(page)) {
13598 +                       struct page **next = (struct page **) kmap(page);
13599 +                       *next = first_conflicting_page;
13600 +                       first_conflicting_page = page;
13601 +                       kunmap(page);
13602 +               }
13603 +       } while (PagePageset1(page));
13604 +
13605 +       return page;
13606 +}
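+
+/*
+ * Note on the allocation loop above: pages that turn out to be in
+ * pageset1 can't be returned, but freeing them at once would just hand
+ * them straight back to the next toi_alloc_page() call.  They are
+ * therefore chained through their own (kmapped) contents off
+ * first_conflicting_page, and only released by free_conflicting_pages()
+ * once all load addresses have been chosen.
+ */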
13607 +
13608 +unsigned long __toi_get_nonconflicting_page(void)
13609 +{
13610 +       struct page *page = ___toi_get_nonconflicting_page(0);
13611 +       return page ? (unsigned long) page_address(page) : 0;
13612 +}
13613 +
13614 +struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
13615 +               int highmem)
13616 +{
13617 +       if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
13618 +                    + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
13619 +               struct page *new_page =
13620 +                       ___toi_get_nonconflicting_page(highmem);
13621 +               if (!new_page)
13622 +                       return ERR_PTR(-ENOMEM);
13623 +               this_pbe = (struct pbe *) kmap(new_page);
13624 +               memset(this_pbe, 0, PAGE_SIZE);
13625 +               *page_ptr = new_page;
13626 +       } else
13627 +               this_pbe++;
13628 +
13629 +       return this_pbe;
13630 +}
13631 +
13632 +/* get_pageset1_load_addresses
13633 + *
13634 + * Description: We check here that the pagedir & the pages it points to won't
13635 + *             collide with the pages to which we're going to restore the
13636 + *             loaded pages later.
13637 + * Returns:    Zero on success, one if couldn't find enough pages (shouldn't
13638 + *             happen).
13639 + */
13640 +
13641 +int toi_get_pageset1_load_addresses(void)
13642 +{
13643 +       int pfn, highallocd = 0, lowallocd = 0;
13644 +       int low_needed = pagedir1.size - get_highmem_size(pagedir1);
13645 +       int high_needed = get_highmem_size(pagedir1);
13646 +       int low_pages_for_highmem = 0;
13647 +       unsigned long flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
13648 +       struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
13649 +                   *low_pbe_page;
13650 +       struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
13651 +                  *this_high_pbe = NULL;
13652 +       int orig_low_pfn = max_pfn + 1, orig_high_pfn = max_pfn + 1;
13653 +       int high_pbes_done = 0, low_pbes_done = 0;
13654 +       int low_direct = 0, high_direct = 0;
13655 +       int high_to_free, low_to_free;
13656 +
13657 +       last_low_pbe_ptr = &restore_pblist;
13658 +
13659 +       /* First, allocate pages for the start of our pbe lists. */
13660 +       if (high_needed) {
13661 +               high_pbe_page = ___toi_get_nonconflicting_page(1);
13662 +               if (!high_pbe_page)
13663 +                       return 1;
13664 +               this_high_pbe = (struct pbe *) kmap(high_pbe_page);
13665 +               memset(this_high_pbe, 0, PAGE_SIZE);
13666 +       }
13667 +
13668 +       low_pbe_page = ___toi_get_nonconflicting_page(0);
13669 +       if (!low_pbe_page)
13670 +               return 1;
13671 +       this_low_pbe = (struct pbe *) page_address(low_pbe_page);
13672 +
13673 +       /*
13674 +        * Next, allocate all possible memory to find where we can
13675 +        * load data directly into destination pages. I'd like to do
13676 +        * this in bigger chunks, but then we can't free pages
13677 +        * individually later.
13678 +        */
13679 +
13680 +       do {
13681 +               page = toi_alloc_page(30, flags);
13682 +               if (page)
13683 +                       SetPagePageset1Copy(page);
13684 +       } while (page);
13685 +
13686 +       /*
13687 +        * Find out how many high- and lowmem pages we allocated above,
13688 +        * and how many pages we can reload directly to their original
13689 +        * location.
13690 +        */
13691 +       BITMAP_FOR_EACH_SET(&pageset1_copy_map, pfn) {
13692 +               int is_high;
13693 +               page = pfn_to_page(pfn);
13694 +               is_high = PageHighMem(page);
13695 +
13696 +               if (PagePageset1(page)) {
13697 +                       if (test_action_state(TOI_NO_DIRECT_LOAD)) {
13698 +                               ClearPagePageset1Copy(page);
13699 +                               toi__free_page(30, page);
13700 +                               continue;
13701 +                       } else {
13702 +                               if (is_high)
13703 +                                       high_direct++;
13704 +                               else
13705 +                                       low_direct++;
13706 +                       }
13707 +               } else {
13708 +                       if (is_high)
13709 +                               highallocd++;
13710 +                       else
13711 +                               lowallocd++;
13712 +               }
13713 +       }
13714 +
13715 +       high_needed -= high_direct;
13716 +       low_needed -= low_direct;
13717 +
13718 +       /*
13719 +        * Do we need to use some lowmem pages for the copies of highmem
13720 +        * pages?
13721 +        */
13722 +       if (high_needed > highallocd) {
13723 +               low_pages_for_highmem = high_needed - highallocd;
13724 +               high_needed -= low_pages_for_highmem;
13725 +               low_needed += low_pages_for_highmem;
13726 +       }
13727 +
13728 +       high_to_free = highallocd - high_needed;
13729 +       low_to_free = lowallocd - low_needed;
13730 +
13731 +       /*
13732 +        * Now generate our pbes (which will be used for the atomic restore)
13733 +        * and free unneeded pages.
13734 +        */
13735 +       BITMAP_FOR_EACH_SET(&pageset1_copy_map, pfn) {
13736 +               int is_high;
13737 +               page = pfn_to_page(pfn);
13738 +               is_high = PageHighMem(page);
13739 +
13740 +               if (PagePageset1(page))
13741 +                       continue;
13742 +
13743 +               /* Free the page? */
13744 +               if ((is_high && high_to_free) ||
13745 +                   (!is_high && low_to_free)) {
13746 +                       ClearPagePageset1Copy(page);
13747 +                       toi__free_page(30, page);
13748 +                       if (is_high)
13749 +                               high_to_free--;
13750 +                       else
13751 +                               low_to_free--;
13752 +                       continue;
13753 +               }
13754 +
13755 +               /* Nope. We're going to use this page. Add a pbe. */
13756 +               if (is_high || low_pages_for_highmem) {
13757 +                       struct page *orig_page;
13758 +                       high_pbes_done++;
13759 +                       if (!is_high)
13760 +                               low_pages_for_highmem--;
13761 +                       do {
13762 +                               orig_high_pfn = get_next_bit_on(&pageset1_map,
13763 +                                               orig_high_pfn);
13764 +                               BUG_ON(orig_high_pfn > max_pfn);
13765 +                               orig_page = pfn_to_page(orig_high_pfn);
13766 +                       } while (!PageHighMem(orig_page) ||
13767 +                                       load_direct(orig_page));
13768 +
13769 +                       this_high_pbe->orig_address = orig_page;
13770 +                       this_high_pbe->address = page;
13771 +                       this_high_pbe->next = NULL;
13772 +                       if (last_high_pbe_page != high_pbe_page) {
13773 +                               *last_high_pbe_ptr =
13774 +                                       (struct pbe *) high_pbe_page;
13775 +                               if (!last_high_pbe_page)
13776 +                                       last_high_pbe_page = high_pbe_page;
13777 +                       } else
13778 +                               *last_high_pbe_ptr = this_high_pbe;
13779 +                       last_high_pbe_ptr = &this_high_pbe->next;
13780 +                       if (last_high_pbe_page != high_pbe_page) {
13781 +                               kunmap(last_high_pbe_page);
13782 +                               last_high_pbe_page = high_pbe_page;
13783 +                       }
13784 +                       this_high_pbe = get_next_pbe(&high_pbe_page,
13785 +                                       this_high_pbe, 1);
13786 +                       if (IS_ERR(this_high_pbe)) {
13787 +                               printk(KERN_INFO
13788 +                                               "This high pbe is an error.\n");
13789 +                               return -ENOMEM;
13790 +                       }
13791 +               } else {
13792 +                       struct page *orig_page;
13793 +                       low_pbes_done++;
13794 +                       do {
13795 +                               orig_low_pfn = get_next_bit_on(&pageset1_map,
13796 +                                               orig_low_pfn);
13797 +                               BUG_ON(orig_low_pfn > max_pfn);
13798 +                               orig_page = pfn_to_page(orig_low_pfn);
13799 +                       } while (PageHighMem(orig_page) ||
13800 +                                       load_direct(orig_page));
13801 +
13802 +                       this_low_pbe->orig_address = page_address(orig_page);
13803 +                       this_low_pbe->address = page_address(page);
13804 +                       this_low_pbe->next = NULL;
13805 +                       *last_low_pbe_ptr = this_low_pbe;
13806 +                       last_low_pbe_ptr = &this_low_pbe->next;
13807 +                       this_low_pbe = get_next_pbe(&low_pbe_page,
13808 +                                       this_low_pbe, 0);
13809 +                       if (IS_ERR(this_low_pbe)) {
13810 +                               printk(KERN_INFO "this_low_pbe is an error.\n");
13811 +                               return -ENOMEM;
13812 +                       }
13813 +               }
13814 +       }
13815 +
13816 +       if (high_pbe_page)
13817 +               kunmap(high_pbe_page);
13818 +
13819 +       if (last_high_pbe_page != high_pbe_page) {
13820 +               if (last_high_pbe_page)
13821 +                       kunmap(last_high_pbe_page);
13822 +               toi__free_page(29, high_pbe_page);
13823 +       }
13824 +
13825 +       free_conflicting_pages();
13826 +
13827 +       return 0;
13828 +}
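+
+/*
+ * The function above only builds the pbe chains; they are consumed at
+ * atomic-restore time by walking restore_pblist (and its highmem
+ * counterpart) and copying each allocated page back over its original.
+ * A lowmem-only sketch of that consumer, as a hypothetical illustration
+ * (the real copy-back lives in the core restore code):
+ *
+ *	struct pbe *p;
+ *
+ *	for (p = restore_pblist; p; p = p->next)
+ *		copy_page(p->orig_address, p->address);
+ */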
13829 +
13830 +int add_boot_kernel_data_pbe(void)
13831 +{
13832 +       this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
13833 +       if (!this_low_pbe->address) {
13834 +               printk(KERN_INFO "Failed to get bkd atomic restore buffer.");
13835 +               return -ENOMEM;
13836 +       }
13837 +
13838 +       toi_bkd.size = sizeof(toi_bkd);
13839 +       memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
13840 +
13841 +       *last_low_pbe_ptr = this_low_pbe;
13842 +       this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
13843 +       this_low_pbe->next = NULL;
13844 +       return 0;
13845 +}
13846 diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
13847 new file mode 100644
13848 index 0000000..081e744
13849 --- /dev/null
13850 +++ b/kernel/power/tuxonice_pagedir.h
13851 @@ -0,0 +1,50 @@
13852 +/*
13853 + * kernel/power/tuxonice_pagedir.h
13854 + *
13855 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
13856 + *
13857 + * This file is released under the GPLv2.
13858 + *
13859 + * Declarations for routines for handling pagesets.
13860 + */
13861 +
13862 +#ifndef KERNEL_POWER_PAGEDIR_H
13863 +#define KERNEL_POWER_PAGEDIR_H
13864 +
13865 +/* Pagedir
13866 + *
13867 + * Contains the metadata for a set of pages saved in the image.
13868 + */
13869 +
13870 +struct pagedir {
13871 +       int id;
13872 +       long size;
13873 +#ifdef CONFIG_HIGHMEM
13874 +       long size_high;
13875 +#endif
13876 +};
13877 +
13878 +#ifdef CONFIG_HIGHMEM
13879 +#define get_highmem_size(pagedir) (pagedir.size_high)
13880 +#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
13881 +#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
13882 +#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
13883 +#else
13884 +#define get_highmem_size(pagedir) (0)
13885 +#define set_highmem_size(pagedir, sz) do { } while (0)
13886 +#define inc_highmem_size(pagedir) do { } while (0)
13887 +#define get_lowmem_size(pagedir) (pagedir.size)
13888 +#endif
13889 +
13890 +extern struct pagedir pagedir1, pagedir2;
13891 +
13892 +extern void toi_copy_pageset1(void);
13893 +
13894 +extern int toi_get_pageset1_load_addresses(void);
13895 +
13896 +extern unsigned long __toi_get_nonconflicting_page(void);
13897 +struct page *___toi_get_nonconflicting_page(int can_be_highmem);
13898 +
13899 +extern void toi_reset_alt_image_pageset2_pfn(void);
13900 +extern int add_boot_kernel_data_pbe(void);
13901 +#endif
13902 diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
13903 new file mode 100644
13904 index 0000000..574858c
13905 --- /dev/null
13906 +++ b/kernel/power/tuxonice_pageflags.c
13907 @@ -0,0 +1,162 @@
13908 +/*
13909 + * kernel/power/tuxonice_pageflags.c
13910 + *
13911 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
13912 + *
13913 + * This file is released under the GPLv2.
13914 + *
13915 + * Routines for serialising and relocating pageflags in which we
13916 + * store our image metadata.
13917 + */
13918 +
13919 +#include <linux/kernel.h>
13920 +#include <linux/mm.h>
13921 +#include <linux/module.h>
13922 +#include <linux/bitops.h>
13923 +#include <linux/list.h>
13924 +#include <linux/suspend.h>
13925 +#include "tuxonice_pageflags.h"
13926 +#include "tuxonice_modules.h"
13927 +#include "tuxonice_pagedir.h"
13928 +#include "tuxonice.h"
13929 +
13930 +DECLARE_DYN_PAGEFLAGS(pageset2_map);
13931 +DECLARE_DYN_PAGEFLAGS(page_resave_map);
13932 +DECLARE_DYN_PAGEFLAGS(io_map);
13933 +DECLARE_DYN_PAGEFLAGS(nosave_map);
13934 +DECLARE_DYN_PAGEFLAGS(free_map);
13935 +
13936 +static int pages_for_zone(struct zone *zone)
13937 +{
13938 +       return DIV_ROUND_UP(zone->spanned_pages, (PAGE_SIZE << 3));
13939 +}
13940 +
13941 +int toi_pageflags_space_needed(void)
13942 +{
13943 +       int total = 0;
13944 +       struct zone *zone;
13945 +
13946 +       for_each_zone(zone)
13947 +               if (populated_zone(zone))
13948 +                       total += sizeof(int) * 3 + pages_for_zone(zone) *
13949 +                               PAGE_SIZE;
13950 +
13951 +       total += sizeof(int);
13952 +
13953 +       return total;
13954 +}
13955 +
13956 +/* save_dyn_pageflags
13957 + *
13958 + * Description: Save a set of pageflags.
13959 + * Arguments:   struct dyn_pageflags *: Pointer to the bitmap being saved.
13960 + */
13961 +
13962 +void save_dyn_pageflags(struct dyn_pageflags *pagemap)
13963 +{
13964 +       int i, zone_idx, size, node = 0;
13965 +       struct zone *zone;
13966 +       struct pglist_data *pgdat;
13967 +
13968 +       if (!pagemap)
13969 +               return;
13970 +
13971 +       for_each_online_pgdat(pgdat) {
13972 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
13973 +                       zone = &pgdat->node_zones[zone_idx];
13974 +
13975 +                       if (!populated_zone(zone))
13976 +                               continue;
13977 +
13978 +                       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13979 +                                       (char *) &node, sizeof(int));
13980 +                       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13981 +                                       (char *) &zone_idx, sizeof(int));
13982 +                       size = pages_for_zone(zone);
13983 +                       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13984 +                                       (char *) &size, sizeof(int));
13985 +
13986 +                       for (i = 0; i < size; i++) {
13987 +                               if (!pagemap->bitmap[node][zone_idx][i+2]) {
13988 +                                       printk(KERN_INFO "Sparse pagemap?\n");
13989 +                                       dump_pagemap(pagemap);
13990 +                                       BUG();
13991 +                               }
13992 +                               toiActiveAllocator->rw_header_chunk(WRITE,
13993 +                                       NULL, (char *) pagemap->bitmap[node]
13994 +                                               [zone_idx][i+2],
13995 +                                       PAGE_SIZE);
13996 +                       }
13997 +               }
13998 +               node++;
13999 +       }
14000 +       node = -1;
14001 +       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
14002 +                       (char *) &node, sizeof(int));
14003 +}
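+
+/*
+ * For reference, the header stream written above is, for each populated
+ * zone, in node order:
+ *
+ *	int node;			(cross-checked on load)
+ *	int zone_idx;			(cross-checked on load)
+ *	int size;			(pages_for_zone(zone))
+ *	char bitmap[size][PAGE_SIZE];	(the zone's pageflag bits)
+ *
+ * followed by a single int -1 as an end-of-data marker, which is what
+ * load_dyn_pageflags() below verifies last.
+ */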
14004 +
14005 +/* load_dyn_pageflags
14006 + *
14007 + * Description: Load a set of pageflags.
14008 + * Arguments:   struct dyn_pageflags *: Pointer to the bitmap being loaded.
14009 + *              (It must be allocated before calling this routine).
14010 + */
14011 +
14012 +int load_dyn_pageflags(struct dyn_pageflags *pagemap)
14013 +{
14014 +       int i, zone_idx, zone_check = 0, size, node = 0;
14015 +       struct zone *zone;
14016 +       struct pglist_data *pgdat;
14017 +
14018 +       if (!pagemap)
14019 +               return 1;
14020 +
14021 +       for_each_online_pgdat(pgdat) {
14022 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
14023 +                       zone = &pgdat->node_zones[zone_idx];
14024 +
14025 +                       if (!populated_zone(zone))
14026 +                               continue;
14027 +
14028 +                       /* Same node? */
14029 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
14030 +                                       (char *) &zone_check, sizeof(int));
14031 +                       if (zone_check != node) {
14032 +                               printk(KERN_INFO "Node read (%d) != node "
14033 +                                               "(%d).\n",
14034 +                                               zone_check, node);
14035 +                               return 1;
14036 +                       }
14037 +
14038 +                       /* Same zone? */
14039 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
14040 +                                       (char *) &zone_check, sizeof(int));
14041 +                       if (zone_check != zone_idx) {
14042 +                               printk(KERN_INFO "Zone read (%d) != node "
14043 +                                               "(%d).\n",
14044 +                                               zone_check, zone_idx);
14045 +                               return 1;
14046 +                       }
14047 +
14049 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
14050 +                               (char *) &size, sizeof(int));
14051 +
14052 +                       for (i = 0; i < size; i++)
14053 +                               toiActiveAllocator->rw_header_chunk(READ, NULL,
14054 +                                       (char *) pagemap->bitmap[node][zone_idx]
14055 +                                                                       [i+2],
14056 +                                       PAGE_SIZE);
14057 +               }
14058 +               node++;
14059 +       }
14060 +       toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &zone_check,
14061 +                       sizeof(int));
14062 +       if (zone_check != -1) {
14063 +               printk(KERN_INFO "Didn't read end of dyn pageflag data marker."
14064 +                               "(%x)\n", zone_check);
14065 +               return 1;
14066 +       }
14067 +
14068 +       return 0;
14069 +}
14070 diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
14071 new file mode 100644
14072 index 0000000..f976b5c
14073 --- /dev/null
14074 +++ b/kernel/power/tuxonice_pageflags.h
14075 @@ -0,0 +1,63 @@
14076 +/*
14077 + * kernel/power/tuxonice_pageflags.h
14078 + *
14079 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
14080 + *
14081 + * This file is released under the GPLv2.
14082 + *
14083 + * TuxOnIce needs a few pageflags while working that aren't otherwise
14084 + * used. To save the struct page pageflags, we dynamically allocate
14085 + * a bitmap and use that. These are the only non order-0 allocations
14086 + * we do.
14087 + *
14088 + * NOTE!!!
14089 + * We assume that PAGE_SIZE - sizeof(void *) is a multiple of
14090 + * sizeof(unsigned long). Is this ever false?
14091 + */
14092 +
14093 +#include <linux/dyn_pageflags.h>
14094 +#include <linux/suspend.h>
14095 +
14096 +extern struct dyn_pageflags pageset1_map;
14097 +extern struct dyn_pageflags pageset1_copy_map;
14098 +extern struct dyn_pageflags pageset2_map;
14099 +extern struct dyn_pageflags page_resave_map;
14100 +extern struct dyn_pageflags io_map;
14101 +extern struct dyn_pageflags nosave_map;
14102 +extern struct dyn_pageflags free_map;
14103 +
14104 +#define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
14105 +#define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
14106 +#define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
14107 +
14108 +#define PagePageset1Copy(page) (test_dynpageflag(&pageset1_copy_map, page))
14109 +#define SetPagePageset1Copy(page) (set_dynpageflag(&pageset1_copy_map, page))
14110 +#define ClearPagePageset1Copy(page) \
14111 +       (clear_dynpageflag(&pageset1_copy_map, page))
14112 +
14113 +#define PagePageset2(page) (test_dynpageflag(&pageset2_map, page))
14114 +#define SetPagePageset2(page) (set_dynpageflag(&pageset2_map, page))
14115 +#define ClearPagePageset2(page) (clear_dynpageflag(&pageset2_map, page))
14116 +
14117 +#define PageWasRW(page) (test_dynpageflag(&pageset2_map, page))
14118 +#define SetPageWasRW(page) (set_dynpageflag(&pageset2_map, page))
14119 +#define ClearPageWasRW(page) (clear_dynpageflag(&pageset2_map, page))
14120 +
14121 +#define PageResave(page) (page_resave_map.bitmap ? \
14122 +       test_dynpageflag(&page_resave_map, page) : 0)
14123 +#define SetPageResave(page) (set_dynpageflag(&page_resave_map, page))
14124 +#define ClearPageResave(page) (clear_dynpageflag(&page_resave_map, page))
14125 +
14126 +#define PageNosave(page) (nosave_map.bitmap ? \
14127 +               test_dynpageflag(&nosave_map, page) : 0)
14128 +#define SetPageNosave(page) (set_dynpageflag(&nosave_map, page))
14129 +#define ClearPageNosave(page) (clear_dynpageflag(&nosave_map, page))
14130 +
14131 +#define PageNosaveFree(page) (free_map.bitmap ? \
14132 +               test_dynpageflag(&free_map, page) : 0)
14133 +#define SetPageNosaveFree(page) (set_dynpageflag(&free_map, page))
14134 +#define ClearPageNosaveFree(page) (clear_dynpageflag(&free_map, page))
14135 +
14136 +extern void save_dyn_pageflags(struct dyn_pageflags *pagemap);
14137 +extern int load_dyn_pageflags(struct dyn_pageflags *pagemap);
14138 +extern int toi_pageflags_space_needed(void);
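
The accessors above behave like the standard page flag macros, but are backed by the dynamically allocated bitmaps. A hypothetical illustration of how marking code uses them, mirroring toi_mark_task_as_pageset later in this patch:

/* Hypothetical example: move a page from pageset2 into pageset1. */
static inline void example_mark_pageset1(struct page *page)
{
	ClearPagePageset2(page);	/* the two pagesets are disjoint */
	SetPagePageset1(page);
}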
14139 diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
14140 new file mode 100644
14141 index 0000000..70168e6
14142 --- /dev/null
14143 +++ b/kernel/power/tuxonice_power_off.c
14144 @@ -0,0 +1,276 @@
14145 +/*
14146 + * kernel/power/tuxonice_power_off.c
14147 + *
14148 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
14149 + *
14150 + * This file is released under the GPLv2.
14151 + *
14152 + * Support for powering down.
14153 + */
14154 +
14155 +#include <linux/device.h>
14156 +#include <linux/suspend.h>
14157 +#include <linux/mm.h>
14158 +#include <linux/pm.h>
14159 +#include <linux/reboot.h>
14160 +#include <linux/cpu.h>
14161 +#include <linux/console.h>
14162 +#include <linux/fs.h>
14163 +#include "tuxonice.h"
14164 +#include "tuxonice_ui.h"
14165 +#include "tuxonice_power_off.h"
14166 +#include "tuxonice_sysfs.h"
14167 +#include "tuxonice_modules.h"
14168 +
14169 +unsigned long toi_poweroff_method; /* 0 - Kernel power off */
14170 +EXPORT_SYMBOL_GPL(toi_poweroff_method);
14171 +
14172 +int wake_delay;
14173 +static char lid_state_file[256], wake_alarm_dir[256];
14174 +static struct file *lid_file, *alarm_file, *epoch_file;
14175 +int post_wake_state = -1;
14176 +
14177 +static int did_suspend_to_both;
14178 +
14179 +/*
14180 + * __toi_power_down
14181 + * Functionality   : Powers down or reboots the computer once the image
14182 + *                   has been written to disk.
14183 + * Key Assumptions : We can reboot/power down via the code called, or the
14184 + *                   warning emitted if the calls fail will be visible to
14185 + *                   the user (i.e. printk resumes devices).
14186 + */
14187 +
14188 +static void __toi_power_down(int method)
14189 +{
14190 +       int error;
14191 +
14192 +       if (test_action_state(TOI_REBOOT)) {
14193 +               toi_cond_pause(1, "Ready to reboot.");
14194 +               kernel_restart(NULL);
14195 +       }
14196 +
14197 +       toi_cond_pause(1, "Powering down.");
14198 +
14199 +       switch (method) {
14200 +       case 0:
14201 +               break;
14202 +       case 3:
14203 +               error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
14204 +               if (!error)
14205 +                       error = suspend_devices_and_enter(PM_SUSPEND_MEM);
14206 +               pm_notifier_call_chain(PM_POST_SUSPEND);
14207 +               if (!error) {
14208 +                       did_suspend_to_both = 1;
14209 +                       return;
14210 +               }
14211 +               break;
14212 +       case 4:
14213 +               if (!hibernation_platform_enter())
14214 +                       return;
14215 +               break;
14216 +       case 5:
14217 +               /* Historic entry only now */
14218 +               break;
14219 +       }
14220 +
14221 +       if (method && method != 5)
14222 +               toi_cond_pause(1,
14223 +                       "Falling back to alternate power off method.");
14224 +
14225 +       if (test_result_state(TOI_ABORTED))
14226 +               return;
14227 +
14228 +       kernel_power_off();
14229 +       kernel_halt();
14230 +       toi_cond_pause(1, "Powerdown failed.");
14231 +       while (1)
14232 +               cpu_relax();
14233 +}
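+
+/*
+ * Summary of the method values accepted above (also settable through the
+ * powerdown_method sysfs entry below):
+ *	0 - plain kernel_power_off()
+ *	3 - suspend to RAM after writing the image ("suspend to both")
+ *	4 - ACPI platform (S4) entry via hibernation_platform_enter()
+ *	5 - historic entry only; now equivalent to 0
+ * Methods 3 and 4 fall back to a plain power off if they fail.
+ */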
14234 +
14235 +#define CLOSE_FILE(file) \
14236 +       if (file) { \
14237 +               filp_close(file, NULL); file = NULL; \
14238 +       }
14239 +
14240 +static void powerdown_cleanup(int toi_or_resume)
14241 +{
14242 +       if (!toi_or_resume)
14243 +               return;
14244 +
14245 +       CLOSE_FILE(lid_file);
14246 +       CLOSE_FILE(alarm_file);
14247 +       CLOSE_FILE(epoch_file);
14248 +}
14249 +
14250 +static void open_file(char *format, char *arg, struct file **var, int mode,
14251 +               char *desc)
14252 +{
14253 +       char buf[256];
14254 +
14255 +       if (strlen(arg)) {
14256 +               sprintf(buf, format, arg);
14257 +               *var = filp_open(buf, mode, 0);
14258 +               if (IS_ERR(*var) || !*var) {
14259 +                       printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
14260 +                               desc, buf, *var);
14261 +                       *var = NULL;
14262 +               }
14263 +       }
14264 +}
14265 +
14266 +static int powerdown_init(int toi_or_resume)
14267 +{
14268 +       if (!toi_or_resume)
14269 +               return 0;
14270 +
14271 +       did_suspend_to_both = 0;
14272 +
14273 +       open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
14274 +                       O_RDONLY, "lid");
14275 +
14276 +       if (strlen(wake_alarm_dir)) {
14277 +               open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
14278 +                               &alarm_file, O_WRONLY, "alarm");
14279 +
14280 +               open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
14281 +                               &epoch_file, O_RDONLY, "epoch");
14282 +       }
14283 +
14284 +       return 0;
14285 +}
14286 +
14287 +static int lid_closed(void)
14288 +{
14289 +       char array[25];
14290 +       ssize_t size;
14291 +       loff_t pos = 0;
14292 +
14293 +       if (!lid_file)
14294 +               return 0;
14295 +
14296 +       size = vfs_read(lid_file, (char __user *) array, 24, &pos);
14297 +       if ((int) size < 1) {
14298 +               printk(KERN_INFO "Failed to read lid state file (%d).\n",
14299 +                       (int) size);
14300 +               return 0;
14301 +       }
14302 +
14303 +       /* vfs_read doesn't NUL-terminate the buffer for us. */
14304 +       array[size] = '\0';
14305 +
14306 +       if (!strcmp(array, "state:      closed\n"))
14304 +               return 1;
14305 +
14306 +       return 0;
14307 +}
14308 +
14309 +static void write_alarm_file(int value)
14310 +{
14311 +       ssize_t size;
14312 +       char buf[40];
14313 +       loff_t pos = 0;
14314 +
14315 +       if (!alarm_file)
14316 +               return;
14317 +
14318 +       sprintf(buf, "%d\n", value);
14319 +
14320 +       size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
14321 +
14322 +       if (size < 0)
14323 +               printk(KERN_INFO "Error %d writing alarm value %s.\n",
14324 +                               (int) size, buf);
14325 +}
14326 +
14327 +/**
14328 + * toi_check_resleep: See whether to powerdown again after waking.
14329 + *
14330 + * After waking, check whether we should powerdown again in a (usually
14331 + * different) way. We only do this if the lid switch is still closed.
14332 + */
14333 +void toi_check_resleep(void)
14334 +{
14335 +       /* Only power down again if the lid is still closed. */
14336 +       if (lid_closed() && post_wake_state >= 0)
14337 +               __toi_power_down(post_wake_state);
14338 +}
14339 +
14340 +void toi_power_down(void)
14341 +{
14342 +       if (alarm_file && wake_delay) {
14343 +               char array[25];
14344 +               loff_t pos = 0;
14345 +               size_t size = vfs_read(epoch_file, (char __user *) array, 24,
14346 +                               &pos);
14347 +
14348 +               if (((int) size) < 1)
14349 +                       printk(KERN_INFO "Failed to read epoch file (%d).\n",
14350 +                                       (int) size);
14351 +               else {
14352 +                       unsigned long since_epoch;
14353 +
14354 +                       /* vfs_read doesn't NUL-terminate the buffer. */
14355 +                       array[size] = '\0';
14356 +                       since_epoch = simple_strtol(array, NULL, 0);
14354 +
14355 +                       /* Clear any wakeup time. */
14356 +                       write_alarm_file(0);
14357 +
14358 +                       /* Set new wakeup time. */
14359 +                       write_alarm_file(since_epoch + wake_delay);
14360 +               }
14361 +       }
14362 +
14363 +       __toi_power_down(toi_poweroff_method);
14364 +
14365 +       toi_check_resleep();
14366 +}
14367 +EXPORT_SYMBOL_GPL(toi_power_down);
14368 +
14369 +static struct toi_sysfs_data sysfs_params[] = {
14370 +#if defined(CONFIG_ACPI)
14371 +       {
14372 +        TOI_ATTR("lid_file", SYSFS_RW),
14373 +        SYSFS_STRING(lid_state_file, 256, 0),
14374 +       },
14375 +
14376 +       {
14377 +         TOI_ATTR("wake_delay", SYSFS_RW),
14378 +         SYSFS_INT(&wake_delay, 0, INT_MAX, 0)
14379 +       },
14380 +
14381 +       {
14382 +         TOI_ATTR("wake_alarm_dir", SYSFS_RW),
14383 +         SYSFS_STRING(wake_alarm_dir, 256, 0)
14384 +       },
14385 +
14386 +       { TOI_ATTR("post_wake_state", SYSFS_RW),
14387 +         SYSFS_INT(&post_wake_state, -1, 5, 0)
14388 +       },
14389 +
14390 +       { TOI_ATTR("powerdown_method", SYSFS_RW),
14391 +         SYSFS_UL(&toi_poweroff_method, 0, 5, 0)
14392 +       },
14393 +
14394 +       { TOI_ATTR("did_suspend_to_both", SYSFS_READONLY),
14395 +         SYSFS_INT(&did_suspend_to_both, 0, 0, 0)
14396 +       },
14397 +#endif
14398 +};
14399 +
14400 +static struct toi_module_ops powerdown_ops = {
14401 +       .type                           = MISC_HIDDEN_MODULE,
14402 +       .name                           = "poweroff",
14403 +       .initialise                     = powerdown_init,
14404 +       .cleanup                        = powerdown_cleanup,
14405 +       .directory                      = "[ROOT]",
14406 +       .module                         = THIS_MODULE,
14407 +       .sysfs_data                     = sysfs_params,
14408 +       .num_sysfs_entries              = sizeof(sysfs_params) /
14409 +               sizeof(struct toi_sysfs_data),
14410 +};
14411 +
14412 +int toi_poweroff_init(void)
14413 +{
14414 +       return toi_register_module(&powerdown_ops);
14415 +}
14416 +
14417 +void toi_poweroff_exit(void)
14418 +{
14419 +       toi_unregister_module(&powerdown_ops);
14420 +}
14421 diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
14422 new file mode 100644
14423 index 0000000..540b081
14424 --- /dev/null
14425 +++ b/kernel/power/tuxonice_power_off.h
14426 @@ -0,0 +1,35 @@
14427 +/*
14428 + * kernel/power/tuxonice_power_off.h
14429 + *
14430 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
14431 + *
14432 + * This file is released under the GPLv2.
14433 + *
14434 + * Support for powering down.
14435 + */
14436 +
14437 +int toi_pm_state_finish(void);
14438 +void toi_power_down(void);
14439 +extern unsigned long toi_poweroff_method;
14440 +extern int toi_platform_prepare(void);
14441 +extern void toi_platform_end(void);
14442 +int toi_poweroff_init(void);
14443 +void toi_poweroff_exit(void);
14444 +void toi_check_resleep(void);
14445 +
14446 +extern int platform_begin(int platform_mode);
14447 +extern int platform_pre_snapshot(int platform_mode);
14448 +extern int platform_leave(int platform_mode);
14449 +extern int platform_end(int platform_mode);
14450 +extern int platform_finish(int platform_mode);
14451 +extern int platform_pre_restore(int platform_mode);
14452 +extern int platform_restore_cleanup(int platform_mode);
14453 +
14454 +#define platform_test() (toi_poweroff_method == 4)
14455 +#define toi_platform_begin() platform_begin(platform_test())
14456 +#define toi_platform_pre_snapshot() platform_pre_snapshot(platform_test())
14457 +#define toi_platform_leave() platform_leave(platform_test())
14458 +#define toi_platform_end() platform_end(platform_test())
14459 +#define toi_platform_finish() platform_finish(platform_test())
14460 +#define toi_platform_pre_restore() platform_pre_restore(platform_test())
14461 +#define toi_platform_restore_cleanup() platform_restore_cleanup(platform_test())
14462 diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
14463 new file mode 100644
14464 index 0000000..ec507e8
14465 --- /dev/null
14466 +++ b/kernel/power/tuxonice_prepare_image.c
14467 @@ -0,0 +1,1052 @@
14468 +/*
14469 + * kernel/power/tuxonice_prepare_image.c
14470 + *
14471 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
14472 + *
14473 + * This file is released under the GPLv2.
14474 + *
14475 + * We need to eat memory until we can:
14476 + * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
14477 + * 2. Fit it all in available space (toiActiveAllocator->available_space() >=
14478 + *    main_storage_needed())
14479 + * 3. Reload the pagedir and pageset1 to places that don't collide with their
14480 + *    final destinations, not knowing to what extent the resumed kernel will
14481 + *    overlap with the one loaded at boot time. I think the resumed kernel
14482 + *    should overlap completely, but I don't want to rely on this as it is
14483 + *    an unproven assumption. We therefore assume there will be no overlap at
14484 +    all (worst case).
14485 + * 4. Meet the user's requested limit (if any) on the size of the image.
14486 + *    The limit is in MB, so pages/256 (assuming 4K pages).
14487 + *
14488 + */
14489 +
14490 +#include <linux/module.h>
14491 +#include <linux/highmem.h>
14492 +#include <linux/freezer.h>
14493 +#include <linux/hardirq.h>
14494 +#include <linux/mmzone.h>
14495 +#include <linux/console.h>
14496 +
14497 +#include "tuxonice_pageflags.h"
14498 +#include "tuxonice_modules.h"
14499 +#include "tuxonice_io.h"
14500 +#include "tuxonice_ui.h"
14501 +#include "tuxonice_extent.h"
14502 +#include "tuxonice_prepare_image.h"
14503 +#include "tuxonice_block_io.h"
14504 +#include "tuxonice.h"
14505 +#include "tuxonice_checksum.h"
14506 +#include "tuxonice_sysfs.h"
14507 +#include "tuxonice_alloc.h"
14508 +
14509 +static long num_nosave, header_space_allocated, main_storage_allocated,
14510 +          storage_available;
14511 +long extra_pd1_pages_allowance = DEFAULT_EXTRA_PAGES_ALLOWANCE;
14512 +int image_size_limit;
14513 +
14514 +struct attention_list {
14515 +       struct task_struct *task;
14516 +       struct attention_list *next;
14517 +};
14518 +
14519 +static struct attention_list *attention_list;
14520 +
14521 +#define PAGESET1 0
14522 +#define PAGESET2 1
14523 +
14524 +void free_attention_list(void)
14525 +{
14526 +       struct attention_list *last = NULL;
14527 +
14528 +       while (attention_list) {
14529 +               last = attention_list;
14530 +               attention_list = attention_list->next;
14531 +               toi_kfree(6, last);
14532 +       }
14533 +}
14534 +
14535 +static int build_attention_list(void)
14536 +{
14537 +       int i, task_count = 0;
14538 +       struct task_struct *p;
14539 +       struct attention_list *next;
14540 +
14541 +       /*
14542 +        * Count all processes marked PF_NOFREEZE, plus the current task.
14543 +        */
14544 +       read_lock(&tasklist_lock);
14545 +       for_each_process(p)
14546 +               if ((p->flags & PF_NOFREEZE) || p == current)
14547 +                       task_count++;
14548 +       read_unlock(&tasklist_lock);
14549 +
14550 +       /*
14551 +        * Allocate attention list structs.
14552 +        */
14553 +       for (i = 0; i < task_count; i++) {
14554 +               struct attention_list *this =
14555 +                       toi_kzalloc(6, sizeof(struct attention_list),
14556 +                                       TOI_WAIT_GFP);
14557 +               if (!this) {
14558 +                       printk(KERN_INFO "Failed to allocate slab for "
14559 +                                       "attention list.\n");
14560 +                       free_attention_list();
14561 +                       return 1;
14562 +               }
14563 +               this->next = NULL;
14564 +               if (attention_list)
14565 +                       this->next = attention_list;
14566 +               attention_list = this;
14567 +       }
14568 +
14569 +       next = attention_list;
14570 +       read_lock(&tasklist_lock);
14571 +       for_each_process(p)
14572 +               if ((p->flags & PF_NOFREEZE) || p == current) {
14573 +                       next->task = p;
14574 +                       next = next->next;
14575 +               }
14576 +       read_unlock(&tasklist_lock);
14577 +       return 0;
14578 +}
14579 +
14580 +static void pageset2_full(void)
14581 +{
14582 +       struct zone *zone;
14583 +       unsigned long flags;
14584 +
14585 +       for_each_zone(zone) {
14586 +               spin_lock_irqsave(&zone->lru_lock, flags);
14587 +               if (zone_page_state(zone, NR_INACTIVE)) {
14588 +                       struct page *page;
14589 +                       list_for_each_entry(page, &zone->inactive_list, lru)
14590 +                               SetPagePageset2(page);
14591 +               }
14592 +               if (zone_page_state(zone, NR_ACTIVE)) {
14593 +                       struct page *page;
14594 +                       list_for_each_entry(page, &zone->active_list, lru)
14595 +                               SetPagePageset2(page);
14596 +               }
14597 +               spin_unlock_irqrestore(&zone->lru_lock, flags);
14598 +       }
14599 +}
14600 +
14601 +/*
14602 + * toi_mark_task_as_pageset
14603 + * Functionality   : Marks all the saveable pages belonging to a given process
14604 + *                  as belonging to a particular pageset.
14605 + */
14606 +
14607 +static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
14608 +{
14609 +       struct vm_area_struct *vma;
14610 +       struct mm_struct *mm;
14611 +
14612 +       mm = t->active_mm;
14613 +
14614 +       if (!mm || !mm->mmap)
14615 +               return;
14616 +
14617 +       if (!irqs_disabled())
14618 +               down_read(&mm->mmap_sem);
14619 +
14620 +       for (vma = mm->mmap; vma; vma = vma->vm_next) {
14621 +               unsigned long posn;
14622 +
14623 +               if (vma->vm_flags & (VM_PFNMAP | VM_IO | VM_RESERVED) ||
14624 +                   !vma->vm_start)
14625 +                       continue;
14626 +
14627 +               for (posn = vma->vm_start; posn < vma->vm_end;
14628 +                               posn += PAGE_SIZE) {
14629 +                       struct page *page = follow_page(vma, posn, 0);
14630 +                       if (!page)
14631 +                               continue;
14632 +
14633 +                       if (pageset2)
14634 +                               SetPagePageset2(page);
14635 +                       else {
14636 +                               ClearPagePageset2(page);
14637 +                               SetPagePageset1(page);
14638 +                       }
14639 +               }
14640 +       }
14641 +
14642 +       if (!irqs_disabled())
14643 +               up_read(&mm->mmap_sem);
14644 +}
14645 +
14646 +/* toi_mark_pages_for_pageset2
14647 + *
14648 + * Description:        Mark unshared pages in processes not needed for hibernate as
14649 + *             eligible to be written out in a separate pagedir.
14650 + *             HighMem pages are simply marked as pageset2. They won't be
14651 + *             needed during hibernate.
14652 + */
14653 +
14654 +static void toi_mark_pages_for_pageset2(void)
14655 +{
14656 +       struct task_struct *p;
14657 +       struct attention_list *this = attention_list;
14658 +
14659 +       if (test_action_state(TOI_NO_PAGESET2))
14660 +               return;
14661 +
14662 +       clear_dyn_pageflags(&pageset2_map);
14663 +
14664 +       if (test_action_state(TOI_PAGESET2_FULL))
14665 +               pageset2_full();
14666 +       else {
14667 +               read_lock(&tasklist_lock);
14668 +               for_each_process(p) {
14669 +                       if (!p->mm || (p->flags & PF_BORROWED_MM))
14670 +                               continue;
14671 +
14672 +                       toi_mark_task_as_pageset(p, PAGESET2);
14673 +               }
14674 +               read_unlock(&tasklist_lock);
14675 +       }
14676 +
14677 +       /*
14678 +        * Because the tasks in attention_list are ones related to hibernating,
14679 +        * we know that they won't go away under us.
14680 +        */
14681 +
14682 +       while (this) {
14683 +               if (!test_result_state(TOI_ABORTED))
14684 +                       toi_mark_task_as_pageset(this->task, PAGESET1);
14685 +               this = this->next;
14686 +       }
14687 +}
14688 +
14689 +/*
14690 + * The atomic copy of pageset1 is stored in pageset2 pages.
14691 + * But if pageset1 is larger (normally only just after boot),
14692 + * we need to allocate extra pages to store the atomic copy.
14693 + * The following data struct and functions are used to handle
14694 + * the allocation and freeing of that memory.
14695 + */
14696 +
14697 +static long extra_pages_allocated;
14698 +
14699 +struct extras {
14700 +       struct page *page;
14701 +       int order;
14702 +       struct extras *next;
14703 +};
14704 +
14705 +static struct extras *extras_list;
14706 +
14707 +/* toi_free_extra_pagedir_memory
14708 + *
14709 + * Description:        Free previously allocated extra pagedir memory.
14710 + */
14711 +void toi_free_extra_pagedir_memory(void)
14712 +{
14713 +       /* Free allocated pages */
14714 +       while (extras_list) {
14715 +               struct extras *this = extras_list;
14716 +               int i;
14717 +
14718 +               extras_list = this->next;
14719 +
14720 +               for (i = 0; i < (1 << this->order); i++)
14721 +                       ClearPageNosave(this->page + i);
14722 +
14723 +               toi_free_pages(9, this->page, this->order);
14724 +               toi_kfree(7, this);
14725 +       }
14726 +
14727 +       extra_pages_allocated = 0;
14728 +}
14729 +
14730 +/* toi_allocate_extra_pagedir_memory
14731 + *
14732 + * Description:        Allocate memory for making the atomic copy of pagedir1 in the
14733 + *             case where it is bigger than pagedir2.
14734 + * Arguments:  int     extra_pages_needed: Total number of extra pages needed.
14735 + * Result:     int.    Number of extra pages we now have allocated.
14736 + */
14737 +static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
14738 +{
14739 +       int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
14740 +       unsigned long flags = TOI_ATOMIC_GFP;
14741 +
14742 +       if (num_to_alloc < 1)
14743 +               return 0;
14744 +
14745 +       order = fls(num_to_alloc);
14746 +       if (order >= MAX_ORDER)
14747 +               order = MAX_ORDER - 1;
14748 +
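+       /*
+        * Illustrative walk-through: for num_to_alloc == 10, fls() returns 4;
+        * the loop below first trims the order to 3 (an 8 page block), then
+        * satisfies the remaining 2 pages with an order 1 block, dropping to
+        * smaller orders whenever a contiguous allocation fails.
+        */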
14749 +       while (num_to_alloc) {
14750 +               struct page *newpage;
14751 +               unsigned long virt;
14752 +               struct extras *extras_entry;
14753 +
14754 +               while ((1 << order) > num_to_alloc)
14755 +                       order--;
14756 +
14757 +               extras_entry = (struct extras *) toi_kzalloc(7,
14758 +                       sizeof(struct extras), TOI_ATOMIC_GFP);
14759 +
14760 +               if (!extras_entry)
14761 +                       return extra_pages_allocated;
14762 +
14763 +               virt = toi_get_free_pages(9, flags, order);
14764 +               while (!virt && order) {
14765 +                       order--;
14766 +                       virt = toi_get_free_pages(9, flags, order);
14767 +               }
14768 +
14769 +               if (!virt) {
14770 +                       toi_kfree(7, extras_entry);
14771 +                       return extra_pages_allocated;
14772 +               }
14773 +
14774 +               newpage = virt_to_page(virt);
14775 +
14776 +               extras_entry->page = newpage;
14777 +               extras_entry->order = order;
14778 +               extras_entry->next = NULL;
14779 +
14780 +               if (extras_list)
14781 +                       extras_entry->next = extras_list;
14782 +
14783 +               extras_list = extras_entry;
14784 +
14785 +               for (j = 0; j < (1 << order); j++) {
14786 +                       SetPageNosave(newpage + j);
14787 +                       SetPagePageset1Copy(newpage + j);
14788 +               }
14789 +
14790 +               extra_pages_allocated += (1 << order);
14791 +               num_to_alloc -= (1 << order);
14792 +       }
14793 +
14794 +       return extra_pages_allocated;
14795 +}
14796 +
14797 +/*
14798 + * real_nr_free_pages: Count free pages (including pcp pages) for the zones
14799 + * whose zone_idx() bits are set in zone_idx_mask (all_zones_mask for all).
14800 + */
14801 +long real_nr_free_pages(unsigned long zone_idx_mask)
14802 +{
14803 +       struct zone *zone;
14804 +       int result = 0, cpu;
14805 +
14806 +       /* PCP lists */
14807 +       for_each_zone(zone) {
14808 +               if (!populated_zone(zone))
14809 +                       continue;
14810 +
14811 +               if (!(zone_idx_mask & (1 << zone_idx(zone))))
14812 +                       continue;
14813 +
14814 +               for_each_online_cpu(cpu) {
14815 +                       struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
14816 +                       struct per_cpu_pages *pcp = &pset->pcp;
14817 +                       result += pcp->count;
14818 +               }
14819 +
14820 +               result += zone_page_state(zone, NR_FREE_PAGES);
14821 +       }
14822 +       return result;
14823 +}
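+
+/*
+ * Typical uses (see tuxonice_prepare_image.h): real_nr_free_pages(all_zones_mask)
+ * counts every zone, while real_nr_free_pages(1 << ZONE_HIGHMEM) counts only
+ * highmem pages, which is how real_nr_free_high_pages() is defined there.
+ */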
14824 +
14825 +/*
14826 + * Discover how much extra memory will be required by the drivers
14827 + * when they're asked to hibernate. We can then ensure that amount
14828 + * of memory is available when we really want it.
14829 + */
14830 +static void get_extra_pd1_allowance(void)
14831 +{
14832 +       long orig_num_free = real_nr_free_pages(all_zones_mask), final;
14833 +
14834 +       toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
14835 +
14836 +       suspend_console();
14837 +       device_suspend(PMSG_FREEZE);
14838 +       local_irq_disable(); /* irqs might have been re-enabled on us */
14839 +       device_power_down(PMSG_FREEZE);
14840 +
14841 +       final = real_nr_free_pages(all_zones_mask);
14842 +
14843 +       device_power_up();
14844 +       local_irq_enable();
14845 +       device_resume();
14846 +       resume_console();
14847 +
14848 +       extra_pd1_pages_allowance = max(
14849 +               orig_num_free - final + DEFAULT_EXTRA_PAGES_ALLOWANCE,
14850 +               (long) DEFAULT_EXTRA_PAGES_ALLOWANCE);
14851 +}
14852 +
14853 +/*
14854 + * Amount of storage needed, possibly taking into account the
14855 + * expected compression ratio and possibly also ignoring our
14856 + * allowance for extra pages.
14857 + */
14858 +static long main_storage_needed(int use_ecr,
14859 +               int ignore_extra_pd1_allow)
14860 +{
14861 +       return ((pagedir1.size + pagedir2.size +
14862 +         (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
14863 +        (use_ecr ? toi_expected_compression_ratio() : 100) / 100);
14864 +}
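+
+/*
+ * Illustrative figures: with pagedir1.size = 50000, pagedir2.size = 30000,
+ * the default allowance of 500 and an expected compression ratio of 50%,
+ * main_storage_needed(1, 0) is (50000 + 30000 + 500) * 50 / 100 = 40250
+ * pages.
+ */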
14865 +
14866 +/*
14867 + * Storage needed for the image header, in pages (converted from bytes).
14868 + */
14869 +static long header_storage_needed(void)
14870 +{
14871 +       long bytes = (int) sizeof(struct toi_header) +
14872 +                       toi_header_storage_for_modules() +
14873 +                       toi_pageflags_space_needed();
14874 +
14875 +       return DIV_ROUND_UP(bytes, PAGE_SIZE);
14876 +}
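+
+/*
+ * E.g. (illustrative): 9000 bytes of header, module and pageflag data round
+ * up to DIV_ROUND_UP(9000, 4096) = 3 pages on a 4K-page system.
+ */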
14877 +
14878 +/*
14879 + * When freeing memory, pages from either pageset might be freed.
14880 + *
14881 + * When seeking to free memory to be able to hibernate, for every ps1 page
14882 + * freed, we need two fewer pages for the atomic copy because there is one
14883 + * fewer page to copy and one more page into which data can be copied.
14884 + *
14885 + * Freeing ps2 pages saves us nothing directly. No more memory is available
14886 + * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
14887 + * that's too much work to figure out.
14888 + *
14889 + * => ps1_to_free functions
14890 + *
14891 + * Of course, if we just want to reduce the image size (because of storage
14892 + * limitations or an image size limit), either ps will do.
14893 + *
14894 + * => any_to_free function
14895 + */
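+
+/*
+ * Worked example (illustrative): if pagedir1 holds 1000 more highmem pages
+ * than pagedir2, DIV_ROUND_UP(1000, 2) = 500 ps1 pages must go, since each
+ * freed page both removes a page to copy and donates a page to copy into;
+ * pages that are already free count against that total, as in
+ * highpages_ps1_to_free() below.
+ */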
14896 +
14897 +static long highpages_ps1_to_free(void)
14898 +{
14899 +       return max_t(long, 0, DIV_ROUND_UP(get_highmem_size(pagedir1) -
14900 +               get_highmem_size(pagedir2), 2) - real_nr_free_high_pages());
14901 +}
14902 +
14903 +static long lowpages_ps1_to_free(void)
14904 +{
14905 +       return max_t(long, 0, DIV_ROUND_UP(get_lowmem_size(pagedir1) +
14906 +               extra_pd1_pages_allowance + MIN_FREE_RAM +
14907 +               toi_memory_for_modules(0) - get_lowmem_size(pagedir2) -
14908 +               real_nr_free_low_pages() - extra_pages_allocated, 2));
14909 +}
14910 +
14911 +static long current_image_size(void)
14912 +{
14913 +       return pagedir1.size + pagedir2.size + header_space_allocated;
14914 +}
14915 +
14916 +static long storage_still_required(void)
14917 +{
14918 +       return max_t(long, 0, main_storage_needed(1, 1) - storage_available);
14919 +}
14920 +
14921 +static long ram_still_required(void)
14922 +{
14923 +       return max_t(long, 0, MIN_FREE_RAM + toi_memory_for_modules(0) -
14924 +               real_nr_free_low_pages() + 2 * extra_pd1_pages_allowance);
14925 +}
14926 +
14927 +static long any_to_free(int use_image_size_limit)
14928 +{
14929 +       long user_limit = (use_image_size_limit && image_size_limit > 0) ?
14930 +               max_t(long, 0, current_image_size() - (image_size_limit << 8))
14931 +               : 0;
14932 +
14933 +       long storage_limit = storage_still_required(),
14934 +           ram_limit = ram_still_required();
14935 +
14936 +       return max(max(user_limit, storage_limit), ram_limit);
14937 +}
14938 +
14939 +/* amount_needed
14940 + *
14941 + * Calculates the amount by which the image size needs to be reduced to meet
14942 + * our constraints.
14943 + */
14944 +static long amount_needed(int use_image_size_limit)
14945 +{
14946 +       return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
14947 +                       any_to_free(use_image_size_limit));
14948 +}
14949 +
14950 +static long image_not_ready(int use_image_size_limit)
14951 +{
14952 +       toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
14953 +               "Amount still needed (%ld) > 0:%d. Header: %ld < %ld: %d,"
14954 +               " Storage allocd: %ld < %ld: %d.\n",
14955 +                       amount_needed(use_image_size_limit),
14956 +                       (amount_needed(use_image_size_limit) > 0),
14957 +                       header_space_allocated, header_storage_needed(),
14958 +                       header_space_allocated < header_storage_needed(),
14959 +                       main_storage_allocated,
14960 +                       main_storage_needed(1, 1),
14961 +                       main_storage_allocated < main_storage_needed(1, 1));
14962 +
14963 +       toi_cond_pause(0, NULL);
14964 +
14965 +       return ((amount_needed(use_image_size_limit) > 0) ||
14966 +               header_space_allocated < header_storage_needed() ||
14967 +                main_storage_allocated < main_storage_needed(1, 1));
14968 +}
14969 +
14970 +static void display_failure_reason(int tries_exceeded)
14971 +{
14972 +       long storage_required = storage_still_required(),
14973 +           ram_required = ram_still_required(),
14974 +           high_ps1 = highpages_ps1_to_free(),
14975 +           low_ps1 = lowpages_ps1_to_free();
14976 +
14977 +       printk(KERN_INFO "Failed to prepare the image because...\n");
14978 +
14979 +       if (!storage_available) {
14980 +               printk(KERN_INFO "- You need some storage available to be "
14981 +                               "able to hibernate.\n");
14982 +               return;
14983 +       }
14984 +
14985 +       if (tries_exceeded)
14986 +               printk(KERN_INFO "- The maximum number of iterations was "
14987 +                               "reached without successfully preparing the "
14988 +                               "image.\n");
14989 +
14990 +       if (header_space_allocated < header_storage_needed()) {
14991 +               printk(KERN_INFO "- Insufficient header storage allocated. "
14992 +                               "Need %ld, have %ld.\n", header_storage_needed(),
14993 +                               header_space_allocated);
14994 +               set_abort_result(TOI_INSUFFICIENT_STORAGE);
14995 +       }
14996 +
14997 +       if (storage_required) {
14998 +               printk(KERN_INFO " - We need at least %ld pages of storage "
14999 +                               "(ignoring the header), but only have %ld.\n",
15000 +                               main_storage_needed(1, 1),
15001 +                               main_storage_allocated);
15002 +               set_abort_result(TOI_INSUFFICIENT_STORAGE);
15003 +       }
15004 +
15005 +       if (ram_required) {
15006 +               printk(KERN_INFO " - We need %ld more free pages of low "
15007 +                               "memory.\n", ram_required);
15008 +               printk(KERN_INFO "     Minimum free     : %8d\n", MIN_FREE_RAM);
15009 +               printk(KERN_INFO "   + Reqd. by modules : %8ld\n",
15010 +                               toi_memory_for_modules(0));
15011 +               printk(KERN_INFO "   - Currently free   : %8ld\n",
15012 +                               real_nr_free_low_pages());
15013 +               printk(KERN_INFO "   + 2 * extra allow  : %8ld\n",
15014 +                               2 * extra_pd1_pages_allowance);
15015 +               printk(KERN_INFO "                      : ========\n");
15016 +               printk(KERN_INFO "     Still needed     : %8ld\n", ram_required);
15017 +
15018 +               /* Print breakdown of memory needed for modules */
15019 +               toi_memory_for_modules(1);
15020 +               set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
15021 +       }
15022 +
15023 +       if (high_ps1) {
15024 +               printk(KERN_INFO "- We need to free %ld highmem pageset 1 "
15025 +                               "pages.\n", high_ps1);
15026 +               set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
15027 +       }
15028 +
15029 +       if (low_ps1) {
15030 +               printk(KERN_INFO "- We need to free %ld lowmem pageset 1 "
15031 +                               "pages.\n", low_ps1);
15032 +               set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
15033 +       }
15034 +}
15035 +
15036 +static void display_stats(int always, int sub_extra_pd1_allow)
15037 +{
15038 +       char buffer[255];
15039 +       snprintf(buffer, 254,
15040 +               "Free:%ld(%ld). Sets:%ld(%ld),%ld(%ld). Header:%ld/%ld. "
15041 +               "Nosave:%ld-%ld=%ld. Storage:%lu/%lu(%lu=>%lu). "
15042 +               "Needed:%ld,%ld,%ld(%d,%ld,%ld,%ld)\n",
15043 +
15044 +               /* Free */
15045 +               real_nr_free_pages(all_zones_mask),
15046 +               real_nr_free_low_pages(),
15047 +
15048 +               /* Sets */
15049 +               pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
15050 +               pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
15051 +
15052 +               /* Header */
15053 +               header_space_allocated, header_storage_needed(),
15054 +
15055 +               /* Nosave */
15056 +               num_nosave, extra_pages_allocated,
15057 +               num_nosave - extra_pages_allocated,
15058 +
15059 +               /* Storage */
15060 +               main_storage_allocated,
15061 +               storage_available,
15062 +               main_storage_needed(1, sub_extra_pd1_allow),
15063 +               main_storage_needed(1, 1),
15064 +
15065 +               /* Needed */
15066 +               lowpages_ps1_to_free(), highpages_ps1_to_free(),
15067 +               any_to_free(1),
15068 +               MIN_FREE_RAM, toi_memory_for_modules(0),
15069 +               extra_pd1_pages_allowance, ((long) image_size_limit) << 8);
15070 +
15071 +       if (always)
15072 +               printk("%s", buffer);
15073 +       else
15074 +               toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
15075 +}
15076 +
15077 +/* generate_free_page_map
15078 + *
15079 + * Description:        This routine generates a bitmap of free pages from the
15080 + *             lists used by the memory manager. We then use the bitmap
15081 + *             to quickly calculate which pages to save and in which
15082 + *             pagesets.
15083 + */
15084 +static void generate_free_page_map(void)
15085 +{
15086 +       int order, pfn, cpu, t;
15087 +       unsigned long flags, i;
15088 +       struct zone *zone;
15089 +       struct list_head *curr;
15090 +
15091 +       for_each_zone(zone) {
15092 +               if (!populated_zone(zone))
15093 +                       continue;
15094 +
15095 +               spin_lock_irqsave(&zone->lock, flags);
15096 +
15097 +               for (i = 0; i < zone->spanned_pages; i++)
15098 +                       ClearPageNosaveFree(pfn_to_page(
15099 +                                               zone->zone_start_pfn + i));
15100 +
15101 +               for_each_migratetype_order(order, t) {
15102 +                       list_for_each(curr,
15103 +                                       &zone->free_area[order].free_list[t]) {
15104 +                               unsigned long i;
15105 +
15106 +                               pfn = page_to_pfn(list_entry(curr, struct page,
15107 +                                                       lru));
15108 +                               for (i = 0; i < (1UL << order); i++)
15109 +                                       SetPageNosaveFree(pfn_to_page(pfn + i));
15110 +                       }
15111 +               }
15112 +
15113 +               for_each_online_cpu(cpu) {
15114 +                       struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
15115 +                       struct per_cpu_pages *pcp = &pset->pcp;
15116 +                       struct page *page;
15117 +
15118 +                       list_for_each_entry(page, &pcp->list, lru)
15119 +                               SetPageNosaveFree(page);
15120 +               }
15121 +
15122 +               spin_unlock_irqrestore(&zone->lock, flags);
15123 +       }
15124 +}
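+
+/*
+ * E.g. (illustrative): an order 3 block on a free list at pfn 0x1000 marks
+ * pfns 0x1000-0x1007 NosaveFree, letting flag_image_pages() below skip the
+ * whole run via size_of_free_region().
+ */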
15125 +
15126 +/* size_of_free_region
15127 + *
15128 + * Description:        Return the number of pages that are free, beginning with and
15129 + *             including this one.
15130 + */
15131 +static int size_of_free_region(struct page *page)
15132 +{
15133 +       struct zone *zone = page_zone(page);
15134 +       unsigned long this_pfn = page_to_pfn(page),
15135 +                     orig_pfn = this_pfn,
15136 +                     end_pfn = zone->zone_start_pfn + zone->spanned_pages - 1;
15137 +
15138 +       while (this_pfn <= end_pfn && PageNosaveFree(pfn_to_page(this_pfn)))
15139 +               this_pfn++;
15140 +
15141 +       return (this_pfn - orig_pfn);
15142 +}
15143 +
15144 +/* flag_image_pages
15145 + *
15146 + * This routine generates our lists of pages to be stored in each
15147 + * pageset. Since we store the data using extents, and adding new
15148 + * extents might allocate a new extent page, this routine may well
15149 + * be called more than once.
15150 + */
15151 +static void flag_image_pages(int atomic_copy)
15152 +{
15153 +       int num_free = 0;
15154 +       unsigned long loop;
15155 +       struct zone *zone;
15156 +
15157 +       pagedir1.size = 0;
15158 +       pagedir2.size = 0;
15159 +
15160 +       set_highmem_size(pagedir1, 0);
15161 +       set_highmem_size(pagedir2, 0);
15162 +
15163 +       num_nosave = 0;
15164 +
15165 +       clear_dyn_pageflags(&pageset1_map);
15166 +
15167 +       generate_free_page_map();
15168 +
15169 +       /*
15170 +        * Pages not to be saved are marked Nosave irrespective of being
15171 +        * reserved.
15172 +        */
15173 +       for_each_zone(zone) {
15174 +               int highmem = is_highmem(zone);
15175 +
15176 +               if (!populated_zone(zone))
15177 +                       continue;
15178 +
15179 +               for (loop = 0; loop < zone->spanned_pages; loop++) {
15180 +                       unsigned long pfn = zone->zone_start_pfn + loop;
15181 +                       struct page *page;
15182 +                       int chunk_size;
15183 +
15184 +                       if (!pfn_valid(pfn))
15185 +                               continue;
15186 +
15187 +                       page = pfn_to_page(pfn);
15188 +
15189 +                       chunk_size = size_of_free_region(page);
15190 +                       if (chunk_size) {
15191 +                               num_free += chunk_size;
15192 +                               loop += chunk_size - 1;
15193 +                               continue;
15194 +                       }
15195 +
15196 +                       if (highmem)
15197 +                               page = saveable_highmem_page(pfn);
15198 +                       else
15199 +                               page = saveable_page(pfn);
15200 +
15201 +                       if (!page || PageNosave(page)) {
15202 +                               num_nosave++;
15203 +                               continue;
15204 +                       }
15205 +
15206 +                       if (PagePageset2(page)) {
15207 +                               pagedir2.size++;
15208 +                               if (PageHighMem(page))
15209 +                                       inc_highmem_size(pagedir2);
15210 +                               else
15211 +                                       SetPagePageset1Copy(page);
15212 +                               if (PageResave(page)) {
15213 +                                       SetPagePageset1(page);
15214 +                                       ClearPagePageset1Copy(page);
15215 +                                       pagedir1.size++;
15216 +                                       if (PageHighMem(page))
15217 +                                               inc_highmem_size(pagedir1);
15218 +                               }
15219 +                       } else {
15220 +                               pagedir1.size++;
15221 +                               SetPagePageset1(page);
15222 +                               if (PageHighMem(page))
15223 +                                       inc_highmem_size(pagedir1);
15224 +                       }
15225 +               }
15226 +       }
15227 +
15228 +       if (atomic_copy)
15229 +               return;
15230 +
15231 +       toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
15232 +               "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld) + "
15233 +               "NumFree (%d) = %d.\n",
15234 +               pagedir1.size, pagedir2.size, num_nosave, num_free,
15235 +               pagedir1.size + pagedir2.size + num_nosave + num_free);
15236 +}
15237 +
15238 +void toi_recalculate_image_contents(int atomic_copy)
15239 +{
15240 +       clear_dyn_pageflags(&pageset1_map);
15241 +       if (!atomic_copy) {
15242 +               int pfn;
15243 +               BITMAP_FOR_EACH_SET(&pageset2_map, pfn)
15244 +                       ClearPagePageset1Copy(pfn_to_page(pfn));
15245 +               /* Need to call this before getting pageset1_size! */
15246 +               toi_mark_pages_for_pageset2();
15247 +       }
15248 +       flag_image_pages(atomic_copy);
15249 +
15250 +       if (!atomic_copy) {
15251 +               storage_available = toiActiveAllocator->storage_available();
15252 +               display_stats(0, 0);
15253 +       }
15254 +}
15255 +
15256 +/* update_image
15257 + *
15258 + * Allocate [more] memory and storage for the image.
15259 + */
15260 +static void update_image(void)
15261 +{
15262 +       int wanted, got;
15263 +       long seek;
15264 +
15265 +       toi_recalculate_image_contents(0);
15266 +
15267 +       /* Include allowance for growth in pagedir1 while writing pagedir2 */
15268 +       wanted = pagedir1.size + extra_pd1_pages_allowance -
15269 +               get_lowmem_size(pagedir2);
15270 +       if (wanted > extra_pages_allocated) {
15271 +               got = toi_allocate_extra_pagedir_memory(wanted);
15272 +               if (got < wanted) {
15273 +                       toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
15274 +                               "Want %d extra pages for pageset1, got %d.\n",
15275 +                               wanted, got);
15276 +                       return;
15277 +               }
15278 +       }
15279 +
15280 +       thaw_kernel_threads();
15281 +
15282 +       /*
15283 +        * Allocate remaining storage space, if possible, up to the
15284 +        * maximum we know we'll need. It's okay to allocate the
15285 +        * maximum if the writer is the swapwriter, but
15286 +        * we don't want to grab all available space on an NFS share.
15287 +        * We therefore ignore the expected compression ratio here,
15288 +        * thereby trying to allocate the maximum image size we could
15289 +        * need (assuming compression doesn't expand the image), but
15290 +        * don't complain if we can't get the full amount we're after.
15291 +        */
15292 +
15293 +       storage_available = toiActiveAllocator->storage_available();
15294 +
15295 +       header_space_allocated = header_storage_needed();
15296 +
15297 +       toiActiveAllocator->reserve_header_space(header_space_allocated);
15298 +
15299 +       seek = min(storage_available, main_storage_needed(0, 0));
15300 +
15301 +       toiActiveAllocator->allocate_storage(seek);
15302 +
15303 +       main_storage_allocated = toiActiveAllocator->storage_allocated();
15304 +
15305 +       if (freeze_processes())
15306 +               set_abort_result(TOI_FREEZING_FAILED);
15307 +
15308 +       toi_recalculate_image_contents(0);
15309 +}
15310 +
15311 +/* attempt_to_freeze
15312 + *
15313 + * Try to freeze processes.
15314 + */
15315 +
15316 +static int attempt_to_freeze(void)
15317 +{
15318 +       int result;
15319 +
15320 +       /* Stop processes before checking again */
15321 +       thaw_processes();
15322 +       toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
15323 +                       "filesystems.");
15324 +       result = freeze_processes();
15325 +
15326 +       if (result)
15327 +               set_abort_result(TOI_FREEZING_FAILED);
15328 +
15329 +       return result;
15330 +}
15331 +
15332 +/* eat_memory
15333 + *
15334 + * Try to free some memory to meet the hard or soft constraints on the image
15335 + * characteristics.
15336 + *
15337 + * Hard constraints:
15338 + * - Pageset1 must be < half of memory;
15339 + * - We must have enough memory free at resume time to have pageset1
15340 + *   be able to be loaded in pages that don't conflict with where it has to
15341 + *   be restored.
15342 + * Soft constraints
15343 + * - User-specified image size limit.
15344 + */
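+
+/*
+ * image_size_limit encoding (see the switch below and any_to_free()):
+ * -1 = don't eat any memory (abort if freeing would be needed), -2 = drop
+ * the page cache first, > 0 = target image size in MB, converted to pages
+ * with << 8 (256 4K pages per MB).
+ */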
15345 +static void eat_memory(void)
15346 +{
15347 +       long amount_wanted = 0;
15348 +       int did_eat_memory = 0;
15349 +
15350 +       /*
15351 +        * Note that if we have enough storage space and enough free memory, we
15352 +        * may exit without eating anything. We give up when the last 10
15353 +        * iterations ate no extra pages because we're not going to get much
15354 +        * more anyway, and the few pages we would get would take a lot of time.
15355 +        *
15356 +        * We freeze processes before beginning, and then unfreeze them if we
15357 +        * need to eat memory until we think we have enough. If our attempts
15358 +        * to freeze fail, we give up and abort.
15359 +        */
15360 +
15361 +       toi_recalculate_image_contents(0);
15362 +       amount_wanted = amount_needed(1);
15363 +
15364 +       switch (image_size_limit) {
15365 +       case -1: /* Don't eat any memory */
15366 +               if (amount_wanted > 0) {
15367 +                       set_abort_result(TOI_WOULD_EAT_MEMORY);
15368 +                       return;
15369 +               }
15370 +               break;
15371 +       case -2:  /* Free caches only */
15372 +               drop_pagecache();
15373 +               toi_recalculate_image_contents(0);
15374 +               amount_wanted = amount_needed(1);
15375 +               did_eat_memory = 1;
15376 +               break;
15377 +       default:
15378 +               break;
15379 +       }
15380 +
15381 +       if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
15382 +                       image_size_limit != -1) {
15383 +               struct zone *zone;
15384 +               int zone_idx;
15385 +
15386 +               toi_prepare_status(CLEAR_BAR,
15387 +                               "Seeking to free %ldMB of memory.",
15388 +                               MB(amount_wanted));
15389 +
15390 +               thaw_kernel_threads();
15391 +
15392 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
15393 +                       long zone_type_free = max_t(long,
15394 +                                       (zone_idx == ZONE_HIGHMEM) ?
15395 +                                       highpages_ps1_to_free() :
15396 +                                       lowpages_ps1_to_free(), amount_wanted);
15397 +
15398 +                       if (zone_type_free < 0)
15399 +                               break;
15400 +
15401 +                       for_each_zone(zone) {
15402 +                               if (zone_idx(zone) != zone_idx)
15403 +                                       continue;
15404 +
15405 +                               shrink_one_zone(zone, zone_type_free, 3);
15406 +
15407 +                               did_eat_memory = 1;
15408 +
15409 +                               toi_recalculate_image_contents(0);
15410 +
15411 +                               amount_wanted = amount_needed(1);
15412 +                               zone_type_free = max_t(long,
15413 +                                       (zone_idx == ZONE_HIGHMEM) ?
15414 +                                       highpages_ps1_to_free() :
15415 +                                       lowpages_ps1_to_free(), amount_wanted);
15416 +
15417 +                               if (zone_type_free < 0)
15418 +                                       break;
15419 +                       }
15420 +               }
15421 +
15422 +               toi_cond_pause(0, NULL);
15423 +
15424 +               if (freeze_processes())
15425 +                       set_abort_result(TOI_FREEZING_FAILED);
15426 +       }
15427 +
15428 +       if (did_eat_memory) {
15429 +               unsigned long orig_state = get_toi_state();
15430 +               /* Freeze_processes will call sys_sync too */
15431 +               restore_toi_state(orig_state);
15432 +               toi_recalculate_image_contents(0);
15433 +       }
15434 +
15435 +       /* Blank out image size display */
15436 +       toi_update_status(100, 100, NULL);
15437 +}
15438 +
15439 +/* toi_prepare_image
15440 + *
15441 + * Entry point to the whole image preparation section.
15442 + *
15443 + * We do four things:
15444 + * - Freeze processes;
15445 + * - Ensure image size constraints are met;
15446 + * - Complete all the preparation for saving the image,
15447 + *   including allocation of storage. The only memory
15448 + *   that should be needed when we're finished is that
15449 + *   for actually storing the image (and we know how
15450 + *   much is needed for that because the modules tell
15451 + *   us).
15452 + * - Make sure that all dirty buffers are written out.
15453 + */
15454 +#define MAX_TRIES 2
15455 +int toi_prepare_image(void)
15456 +{
15457 +       int result = 1, tries = 1;
15458 +
15459 +       header_space_allocated = 0;
15460 +       main_storage_allocated = 0;
15461 +
15462 +       if (attempt_to_freeze())
15463 +               return 1;
15464 +
15465 +       if (!extra_pd1_pages_allowance)
15466 +               get_extra_pd1_allowance();
15467 +
15468 +       storage_available = toiActiveAllocator->storage_available();
15469 +
15470 +       if (!storage_available) {
15471 +               printk(KERN_INFO "No storage available. Didn't try to prepare "
15472 +                               "an image.\n");
15473 +               display_failure_reason(0);
15474 +               set_abort_result(TOI_NOSTORAGE_AVAILABLE);
15475 +               return 1;
15476 +       }
15477 +
15478 +       if (build_attention_list()) {
15479 +               abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
15480 +                               "Unable to successfully prepare the image.\n");
15481 +               return 1;
15482 +       }
15483 +
15484 +       do {
15485 +               toi_prepare_status(CLEAR_BAR,
15486 +                               "Preparing Image. Try %d.", tries);
15487 +
15488 +               eat_memory();
15489 +
15490 +               if (test_result_state(TOI_ABORTED))
15491 +                       break;
15492 +
15493 +               update_image();
15494 +
15495 +               tries++;
15496 +
15497 +       } while (image_not_ready(1) && tries <= MAX_TRIES &&
15498 +                       !test_result_state(TOI_ABORTED));
15499 +
15500 +       result = image_not_ready(0);
15501 +
15502 +       if (!test_result_state(TOI_ABORTED)) {
15503 +               if (result) {
15504 +                       display_stats(1, 0);
15505 +                       display_failure_reason(tries > MAX_TRIES);
15506 +                       abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
15507 +                               "Unable to successfully prepare the image.\n");
15508 +               } else {
15509 +                       unlink_lru_lists();
15510 +                       toi_cond_pause(1, "Image preparation complete.");
15511 +               }
15512 +       }
15513 +
15514 +       return result ? result : allocate_checksum_pages();
15515 +}
15516 +
15517 +#ifdef CONFIG_TOI_EXPORTS
15518 +EXPORT_SYMBOL_GPL(real_nr_free_pages);
15519 +#endif
15520 diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
15521 new file mode 100644
15522 index 0000000..9604c79
15523 --- /dev/null
15524 +++ b/kernel/power/tuxonice_prepare_image.h
15525 @@ -0,0 +1,35 @@
15526 +/*
15527 + * kernel/power/tuxonice_prepare_image.h
15528 + *
15529 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
15530 + *
15531 + * This file is released under the GPLv2.
15532 + *
15533 + */
15534 +
15535 +#include <asm/sections.h>
15536 +
15537 +extern int toi_prepare_image(void);
15538 +extern void toi_recalculate_image_contents(int atomic_copy);
15539 +extern long real_nr_free_pages(unsigned long zone_idx_mask);
15540 +extern int image_size_limit;
15541 +extern void toi_free_extra_pagedir_memory(void);
15542 +extern long extra_pd1_pages_allowance;
15543 +extern void free_attention_list(void);
15544 +
15545 +#define MIN_FREE_RAM 100
15546 +#define DEFAULT_EXTRA_PAGES_ALLOWANCE 500
15547 +
15548 +#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
15549 +#ifdef CONFIG_HIGHMEM
15550 +#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
15551 +#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
15552 +                                               (1 << ZONE_HIGHMEM)))
15553 +#else
15554 +#define real_nr_free_high_pages() (0)
15555 +#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
15556 +
15557 +/* For eat_memory function */
15558 +#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
15559 +#endif
15560 +
15561 diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
15562 new file mode 100644
15563 index 0000000..777ff3c
15564 --- /dev/null
15565 +++ b/kernel/power/tuxonice_storage.c
15566 @@ -0,0 +1,292 @@
15567 +/*
15568 + * kernel/power/tuxonice_storage.c
15569 + *
15570 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
15571 + *
15572 + * This file is released under the GPLv2.
15573 + *
15574 + * Routines for talking to a userspace program that manages storage.
15575 + *
15576 + * The kernel side:
15577 + * - starts the userspace program;
15578 + * - sends messages telling it when to open and close the connection;
15579 + * - tells it when to quit;
15580 + *
15581 + * The user space side:
15582 + * - passes messages regarding status;
15583 + *
15584 + */
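+
+/*
+ * Typical exchange (sketch): toi_prepare_usm() starts the helper via
+ * netlink, toi_activate_storage() sends USM_MSG_CONNECT and waits up to
+ * two seconds for USM_MSG_SUCCESS or USM_MSG_FAILED in reply, and
+ * toi_deactivate_storage() does the same with USM_MSG_DISCONNECT.
+ */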
15585 +
15586 +#include <linux/suspend.h>
15587 +#include <linux/freezer.h>
15588 +
15589 +#include "tuxonice_sysfs.h"
15590 +#include "tuxonice_modules.h"
15591 +#include "tuxonice_netlink.h"
15592 +#include "tuxonice_storage.h"
15593 +#include "tuxonice_ui.h"
15594 +
15595 +static struct user_helper_data usm_helper_data;
15596 +static struct toi_module_ops usm_ops;
15597 +static int message_received, usm_prepare_count;
15598 +static int storage_manager_last_action, storage_manager_action;
15599 +
15600 +static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
15601 +{
15602 +       int type;
15603 +       int *data;
15604 +
15605 +       type = nlh->nlmsg_type;
15606 +
15607 +       /* A control message: ignore it */
15608 +       if (type < NETLINK_MSG_BASE)
15609 +               return 0;
15610 +
15611 +       /* Unknown message: reply with EINVAL */
15612 +       if (type >= USM_MSG_MAX)
15613 +               return -EINVAL;
15614 +
15615 +       /* All operations require privileges, even GET */
15616 +       if (security_netlink_recv(skb, CAP_NET_ADMIN))
15617 +               return -EPERM;
15618 +
15619 +       /* Only allow one task to receive NOFREEZE privileges */
15620 +       if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
15621 +               return -EBUSY;
15622 +
15623 +       data = (int *) NLMSG_DATA(nlh);
15624 +
15625 +       switch (type) {
15626 +       case USM_MSG_SUCCESS:
15627 +       case USM_MSG_FAILED:
15628 +               message_received = type;
15629 +               complete(&usm_helper_data.wait_for_process);
15630 +               break;
15631 +       default:
15632 +               printk(KERN_INFO "Storage manager doesn't recognise "
15633 +                               "message %d.\n", type);
15634 +       }
15635 +
15636 +       return 1;
15637 +}
15638 +
15639 +#ifdef CONFIG_NET
15640 +static int activations;
15641 +
15642 +int toi_activate_storage(int force)
15643 +{
15644 +       int tries = 1;
15645 +
15646 +       if (usm_helper_data.pid == -1 || !usm_ops.enabled)
15647 +               return 0;
15648 +
15649 +       message_received = 0;
15650 +       activations++;
15651 +
15652 +       if (activations > 1 && !force)
15653 +               return 0;
15654 +
15655 +       while ((!message_received || message_received == USM_MSG_FAILED) &&
15656 +                       tries < 2) {
15657 +               toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
15658 +                               "%d.\n", tries);
15659 +
15660 +               init_completion(&usm_helper_data.wait_for_process);
15661 +
15662 +               toi_send_netlink_message(&usm_helper_data,
15663 +                       USM_MSG_CONNECT,
15664 +                       NULL, 0);
15665 +
15666 +               /* Wait 2 seconds for the userspace process to make contact */
15667 +               wait_for_completion_timeout(&usm_helper_data.wait_for_process,
15668 +                               2*HZ);
15669 +
15670 +               tries++;
15671 +       }
15672 +
15673 +       return 0;
15674 +}
15675 +
15676 +int toi_deactivate_storage(int force)
15677 +{
15678 +       if (usm_helper_data.pid == -1 || !usm_ops.enabled)
15679 +               return 0;
15680 +
15681 +       message_received = 0;
15682 +       activations--;
15683 +
15684 +       if (activations && !force)
15685 +               return 0;
15686 +
15687 +       init_completion(&usm_helper_data.wait_for_process);
15688 +
15689 +       toi_send_netlink_message(&usm_helper_data,
15690 +                       USM_MSG_DISCONNECT,
15691 +                       NULL, 0);
15692 +
15693 +       wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
15694 +
15695 +       if (!message_received || message_received == USM_MSG_FAILED) {
15696 +               printk(KERN_INFO "Returning failure disconnecting storage.\n");
15697 +               return 1;
15698 +       }
15699 +
15700 +       return 0;
15701 +}
15702 +#endif
15703 +
15704 +static void storage_manager_simulate(void)
15705 +{
15706 +       printk(KERN_INFO "--- Storage manager simulate ---\n");
15707 +       toi_prepare_usm();
15708 +       schedule();
15709 +       printk(KERN_INFO "--- Activate storage 1 ---\n");
15710 +       toi_activate_storage(1);
15711 +       schedule();
15712 +       printk(KERN_INFO "--- Deactivate storage 1 ---\n");
15713 +       toi_deactivate_storage(1);
15714 +       schedule();
15715 +       printk(KERN_INFO "--- Cleanup usm ---\n");
15716 +       toi_cleanup_usm();
15717 +       schedule();
15718 +       printk(KERN_INFO "--- Storage manager simulate ends ---\n");
15719 +}
15720 +
15721 +static int usm_storage_needed(void)
15722 +{
15723 +       return strlen(usm_helper_data.program);
15724 +}
15725 +
15726 +static int usm_save_config_info(char *buf)
15727 +{
15728 +       int len = strlen(usm_helper_data.program);
15729 +       memcpy(buf, usm_helper_data.program, len);
15730 +       return len;
15731 +}
15732 +
15733 +static void usm_load_config_info(char *buf, int size)
15734 +{
15735 +       /* Don't load the saved path if one has already been set */
15736 +       if (usm_helper_data.program[0])
15737 +               return;
15738 +
15739 +       memcpy(usm_helper_data.program, buf, size);
15740 +}
15741 +
15742 +static int usm_memory_needed(void)
15743 +{
15744 +       /* ballpark figure of 32 pages */
15745 +       return (32 * PAGE_SIZE);
15746 +}
15747 +
15748 +/* toi_prepare_usm
15749 + */
15750 +int toi_prepare_usm(void)
15751 +{
15752 +       usm_prepare_count++;
15753 +
15754 +       if (usm_prepare_count > 1 || !usm_ops.enabled)
15755 +               return 0;
15756 +
15757 +       usm_helper_data.pid = -1;
15758 +
15759 +       if (!*usm_helper_data.program)
15760 +               return 0;
15761 +
15762 +       toi_netlink_setup(&usm_helper_data);
15763 +
15764 +       if (usm_helper_data.pid == -1)
15765 +               printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
15766 +                               " start it.\n");
15767 +
15768 +       toi_activate_storage(0);
15769 +
15770 +       return (usm_helper_data.pid != -1);
15771 +}
15772 +
15773 +void toi_cleanup_usm(void)
15774 +{
15775 +       usm_prepare_count--;
15776 +
15777 +       if (usm_helper_data.pid > -1 && !usm_prepare_count) {
15778 +               toi_deactivate_storage(0);
15779 +               toi_netlink_close(&usm_helper_data);
15780 +       }
15781 +}
15782 +
15783 +static void storage_manager_activate(void)
15784 +{
15785 +       if (storage_manager_action == storage_manager_last_action)
15786 +               return;
15787 +
15788 +       if (storage_manager_action)
15789 +               toi_prepare_usm();
15790 +       else
15791 +               toi_cleanup_usm();
15792 +
15793 +       storage_manager_last_action = storage_manager_action;
15794 +}
15795 +
15796 +/*
15797 + * User interface specific /sys/power/tuxonice entries.
15798 + */
15799 +
15800 +static struct toi_sysfs_data sysfs_params[] = {
15801 +       { TOI_ATTR("simulate_atomic_copy", SYSFS_RW),
15802 +         .type                         = TOI_SYSFS_DATA_NONE,
15803 +         .write_side_effect            = storage_manager_simulate,
15804 +       },
15805 +
15806 +       { TOI_ATTR("enabled", SYSFS_RW),
15807 +         SYSFS_INT(&usm_ops.enabled, 0, 1, 0)
15808 +       },
15809 +
15810 +       { TOI_ATTR("program", SYSFS_RW),
15811 +         SYSFS_STRING(usm_helper_data.program, 254, 0)
15812 +       },
15813 +
15814 +       { TOI_ATTR("activate_storage", SYSFS_RW),
15815 +         SYSFS_INT(&storage_manager_action, 0, 1, 0),
15816 +         .write_side_effect            = storage_manager_activate,
15817 +       }
15818 +};
15819 +
15820 +static struct toi_module_ops usm_ops = {
15821 +       .type                           = MISC_MODULE,
15822 +       .name                           = "usm",
15823 +       .directory                      = "storage_manager",
15824 +       .module                         = THIS_MODULE,
15825 +       .storage_needed                 = usm_storage_needed,
15826 +       .save_config_info               = usm_save_config_info,
15827 +       .load_config_info               = usm_load_config_info,
15828 +       .memory_needed                  = usm_memory_needed,
15829 +
15830 +       .sysfs_data                     = sysfs_params,
15831 +       .num_sysfs_entries              = sizeof(sysfs_params) /
15832 +               sizeof(struct toi_sysfs_data),
15833 +};
15834 +
15835 +/* toi_usm_init
15836 + * Description: Boot time initialisation for the userspace storage manager.
15837 + */
15838 +int toi_usm_init(void)
15839 +{
15840 +       usm_helper_data.nl = NULL;
15841 +       usm_helper_data.program[0] = '\0';
15842 +       usm_helper_data.pid = -1;
15843 +       usm_helper_data.skb_size = 0;
15844 +       usm_helper_data.pool_limit = 6;
15845 +       usm_helper_data.netlink_id = NETLINK_TOI_USM;
15846 +       usm_helper_data.name = "userspace storage manager";
15847 +       usm_helper_data.rcv_msg = usm_user_rcv_msg;
15848 +       usm_helper_data.interface_version = 1;
15849 +       usm_helper_data.must_init = 0;
15850 +       init_completion(&usm_helper_data.wait_for_process);
15851 +
15852 +       return toi_register_module(&usm_ops);
15853 +}
15854 +
15855 +void toi_usm_exit(void)
15856 +{
15857 +       toi_unregister_module(&usm_ops);
15858 +}
15859 diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
15860 new file mode 100644
15861 index 0000000..2f895bf
15862 --- /dev/null
15863 +++ b/kernel/power/tuxonice_storage.h
15864 @@ -0,0 +1,53 @@
15865 +/*
15866 + * kernel/power/tuxonice_storage.h
15867 + *
15868 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
15869 + *
15870 + * This file is released under the GPLv2.
15871 + */
15872 +
15873 +#ifdef CONFIG_NET
15874 +int toi_prepare_usm(void);
15875 +void toi_cleanup_usm(void);
15876 +
15877 +int toi_activate_storage(int force);
15878 +int toi_deactivate_storage(int force);
15879 +extern int toi_usm_init(void);
15880 +extern void toi_usm_exit(void);
15881 +#else
15882 +static inline int toi_usm_init(void) { return 0; }
15883 +static inline void toi_usm_exit(void) { }
15884 +
15885 +static inline int toi_activate_storage(int force)
15886 +{
15887 +       return 0;
15888 +}
15889 +
15890 +static inline int toi_deactivate_storage(int force)
15891 +{
15892 +       return 0;
15893 +}
15894 +
15895 +static inline int toi_prepare_usm(void) { return 0; }
15896 +static inline void toi_cleanup_usm(void) { }
15897 +#endif
15898 +
15899 +enum {
15900 +       USM_MSG_BASE = 0x10,
15901 +
15902 +       /* Kernel -> Userspace */
15903 +       USM_MSG_CONNECT = 0x30,
15904 +       USM_MSG_DISCONNECT = 0x31,
15905 +       USM_MSG_SUCCESS = 0x40,
15906 +       USM_MSG_FAILED = 0x41,
15907 +
15908 +       USM_MSG_MAX,
15909 +};
15918 diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
15919 new file mode 100644
15920 index 0000000..3cecaa0
15921 --- /dev/null
15922 +++ b/kernel/power/tuxonice_swap.c
15923 @@ -0,0 +1,1280 @@
15924 +/*
15925 + * kernel/power/tuxonice_swap.c
15926 + *
15927 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
15928 + *
15929 + * Distributed under GPLv2.
15930 + *
15931 + * This file encapsulates functions for usage of swap space as a
15932 + * backing store.
15933 + */
15934 +
15935 +#include <linux/suspend.h>
15936 +#include <linux/module.h>
15937 +#include <linux/blkdev.h>
15938 +#include <linux/swapops.h>
15939 +#include <linux/swap.h>
15940 +#include <linux/syscalls.h>
15941 +
15942 +#include "tuxonice.h"
15943 +#include "tuxonice_sysfs.h"
15944 +#include "tuxonice_modules.h"
15945 +#include "tuxonice_io.h"
15946 +#include "tuxonice_ui.h"
15947 +#include "tuxonice_extent.h"
15948 +#include "tuxonice_block_io.h"
15949 +#include "tuxonice_alloc.h"
15950 +
15951 +static struct toi_module_ops toi_swapops;
15952 +
15953 +/* --- Struct of pages stored on disk */
15954 +
15955 +struct tuxonice_sig_data {
15956 +       dev_t device;
15957 +       unsigned long sector;
15958 +       int resume_attempted;
15959 +       int orig_sig_type;
15960 +};
15961 +
15962 +union diskpage {
15963 +       union swap_header swh;  /* swh.magic is the only member used */
15964 +       struct tuxonice_sig_data tuxonice_sig_data;
15965 +};
15966 +
15967 +union p_diskpage {
15968 +       union diskpage *pointer;
15969 +       char *ptr;
15970 +       unsigned long address;
15971 +};
15972 +
15973 +enum {
15974 +       IMAGE_SIGNATURE,
15975 +       NO_IMAGE_SIGNATURE,
15976 +       TRIED_RESUME,
15977 +       NO_TRIED_RESUME,
15978 +};
15979 +
15980 +/*
15981 + * Both of these describe the swap header page. current_signature_page holds
15982 + * the data we read from disk at the start of hibernating or checking whether
15983 + * to resume. no_image_signature_contents is stored in the image header,
15984 + * showing what the swap header page looked like when hibernating began.
15985 + */
15986 +static char *current_signature_page;
15987 +static char no_image_signature_contents[sizeof(struct tuxonice_sig_data)];
15988 +
15989 +/* Devices used for swap */
15990 +static struct toi_bdev_info devinfo[MAX_SWAPFILES];
15991 +
15992 +/* Extent chains for swap & blocks */
15993 +struct extent_chain swapextents;
15994 +struct extent_chain block_chain[MAX_SWAPFILES];
15995 +
15996 +static dev_t header_dev_t;
15997 +static struct block_device *header_block_device;
15998 +static unsigned long headerblock;
15999 +
16000 +/* For swapfile automatically swapon/off'd. */
16001 +static char swapfilename[32] = "";
16002 +static int toi_swapon_status;
16003 +
16004 +/* Header Page Information */
16005 +static long header_pages_reserved;
16006 +
16007 +/* Swap Pages */
16008 +static long swap_pages_allocated;
16009 +
16010 +/* User Specified Parameters. */
16011 +
16012 +static unsigned long resume_firstblock;
16013 +static dev_t resume_swap_dev_t;
16014 +static struct block_device *resume_block_device;
16015 +
16016 +struct sysinfo swapinfo;
16017 +
16018 +/* Block devices open. */
16019 +struct bdev_opened {
16020 +       dev_t device;
16021 +       struct block_device *bdev;
16022 +};
16023 +
16024 +/*
16025 + * Entry MAX_SWAPFILES is the resume block device, which may
16026 + * be a swap device not enabled when we hibernate.
16027 + * Entry MAX_SWAPFILES + 1 is the header block device, which
16028 + * is needed before we find out which slot it occupies.
16029 + *
16030 + * We use a struct separate from devinfo so that we can track
16031 + * the bdevs we open, because if we need to abort resuming
16032 + * prior to the atomic restore, they need to be closed, but
16033 + * closing them after successfully resuming would be wrong.
16034 + */
16035 +static struct bdev_opened *bdevs_opened[MAX_SWAPFILES + 2];
16036 +
16037 +/**
16038 + * close_bdev: Close a swap bdev.
16039 + *
16040 + * i: The swap entry number to close.
16041 + */
16042 +static void close_bdev(int i)
16043 +{
16044 +       struct bdev_opened *this = bdevs_opened[i];
16045 +
16046 +       if (!this)
16047 +               return;
16048 +
16049 +       blkdev_put(this->bdev);
16050 +       toi_kfree(8, this);
16051 +       bdevs_opened[i] = NULL;
16052 +}
16053 +
16054 +/**
16055 + * close_bdevs: Close all bdevs we opened.
16056 + *
16057 + * Close all bdevs that we opened and reset the related vars.
16058 + */
16059 +static void close_bdevs(void)
16060 +{
16061 +       int i;
16062 +
16063 +       for (i = 0; i < MAX_SWAPFILES + 2; i++)
16064 +               close_bdev(i);
16065 +
16066 +       resume_block_device = header_block_device = NULL;
16067 +}
16068 +
16069 +/**
16070 + * open_bdev: Open a bdev at resume time.
16071 + *
16072 + * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t
16073 + * (the user can have resume= pointing at a swap partition/file that isn't
16074 + * swapon'd when they hibernate), or MAX_SWAPFILES+1 for the first page of
16075 + * the header. The latter comes from a swap partition that was enabled when
16076 + * we hibernated, but we don't know its real index until we read that page.
16077 + * device: The device major/minor.
16078 + * display_errs: Whether to report errors that occur.
16079 + *
16080 + * We stored a dev_t in the image header. Open the matching device without
16081 + * requiring /dev/<whatever> in most cases and record the details needed
16082 + * to close it later and avoid duplicating work.
16083 + */
16084 +static struct block_device *open_bdev(int index, dev_t device, int display_errs)
16085 +{
16086 +       struct bdev_opened *this;
16087 +       struct block_device *bdev;
16088 +
16089 +       if (bdevs_opened[index]) {
16090 +               if (bdevs_opened[index]->device == device)
16091 +                       return bdevs_opened[index]->bdev;
16092 +
16093 +               close_bdev(index);
16094 +       }
16095 +
16096 +       bdev = toi_open_by_devnum(device, FMODE_READ);
16097 +
16098 +       if (IS_ERR(bdev) || !bdev) {
16099 +               if (display_errs)
16100 +                       toi_early_boot_message(1, TOI_CONTINUE_REQ,
16101 +                               "Failed to get access to block device "
16102 +                               "\"%x\" (error %d).\n Maybe you need "
16103 +                               "to run mknod and/or lvmsetup in an "
16104 +                               "initrd/ramfs?", device, (int) PTR_ERR(bdev));
16105 +               return ERR_PTR(-EINVAL);
16106 +       }
16107 +
16108 +       this = toi_kzalloc(8, sizeof(struct bdev_opened), GFP_KERNEL);
16109 +       if (!this) {
16110 +               printk(KERN_WARNING "TuxOnIce: Failed to allocate memory for "
16111 +                               "opening a bdev.");
16112 +               blkdev_put(bdev);
16113 +               return ERR_PTR(-ENOMEM);
16114 +       }
16115 +
16116 +       bdevs_opened[index] = this;
16117 +       this->device = device;
16118 +       this->bdev = bdev;
16119 +
16120 +       return bdev;
16121 +}
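A hypothetical caller (the function name here is illustrative, not from the patch) shows the intended lifecycle: open into the reserved slot, then on any abort before the atomic restore, release everything via close_bdevs():

        /* Hypothetical caller sketch. open_bdev() caches its handle in
         * bdevs_opened[], so calling it again with the same dev_t is cheap. */
        static int example_try_resume(dev_t resume_dev)
        {
                struct block_device *bdev;

                bdev = open_bdev(MAX_SWAPFILES, resume_dev, 1);
                if (IS_ERR(bdev))
                        return PTR_ERR(bdev);

                /* ... read the signature page, decide whether to resume ... */

                close_bdevs();  /* abort path: close every cached bdev */
                return 0;
        }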
16122 +
16123 +/**
16124 + * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
16125 + *
16126 + * Activate the given swapfile if it wasn't already enabled. Remember whether
16127 + * we really did swapon it for swapoffing later.
16128 + */
16129 +static void enable_swapfile(void)
16130 +{
16131 +       int activateswapresult = -EINVAL;
16132 +
16133 +       if (swapfilename[0]) {
16134 +               /* Attempt to swap on with maximum priority */
16135 +               activateswapresult = sys_swapon(swapfilename, 0xFFFF);
16136 +               if (activateswapresult && activateswapresult != -EBUSY)
16137 +                       printk(KERN_WARNING "TuxOnIce: The swapfile/partition specified by "
16138 +                               "/sys/power/tuxonice/swap/swapfile "
16139 +                               "(%s) could not be turned on (error %d). "
16140 +                               "Attempting to continue.\n",
16141 +                               swapfilename, activateswapresult);
16142 +               if (!activateswapresult)
16143 +                       toi_swapon_status = 1;
16144 +       }
16145 +}
16146 +
16147 +/**
16148 + * disable_swapfile: Swapoff any file swaponed at the start of the cycle.
16149 + *
16150 + * If we did successfully swapon a file at the start of the cycle, swapoff
16151 + * it now (finishing up).
16152 + */
16153 +static void disable_swapfile(void)
16154 +{
16155 +       if (!toi_swapon_status)
16156 +               return;
16157 +
16158 +       sys_swapoff(swapfilename);
16159 +       toi_swapon_status = 0;
16160 +}
16161 +
16162 +/**
16163 + * try_to_parse_resume_device: Try to parse resume=
16164 + *
16165 + * Any "swap:" has been stripped away and we just have the path to deal with.
16166 + * We attempt name_to_dev_t first; failing that, we open and stat the file
16167 + * to find the device. Finally we open the matching struct block_device *.
16168 + */
16169 +static int try_to_parse_resume_device(char *commandline, int quiet)
16170 +{
16171 +       struct kstat stat;
16172 +       int error = 0;
16173 +
16174 +       resume_swap_dev_t = name_to_dev_t(commandline);
16175 +
16176 +       if (!resume_swap_dev_t) {
16177 +               struct file *file = filp_open(commandline,
16178 +                               O_RDONLY|O_LARGEFILE, 0);
16179 +
16180 +               if (!IS_ERR(file) && file) {
16181 +                       error = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
16182 +                       filp_close(file, NULL);
16183 +               } else
16184 +                       error = vfs_stat(commandline, &stat);
16185 +               if (!error)
16186 +                       resume_swap_dev_t = stat.rdev;
16187 +       }
16188 +
16189 +       if (!resume_swap_dev_t) {
16190 +               if (quiet)
16191 +                       return 1;
16192 +
16193 +               if (test_toi_state(TOI_TRYING_TO_RESUME))
16194 +                       toi_early_boot_message(1, TOI_CONTINUE_REQ,
16195 +                         "Failed to translate \"%s\" into a device id.\n",
16196 +                         commandline);
16197 +               else
16198 +                       printk(KERN_INFO "TuxOnIce: Can't translate \"%s\" into a device "
16199 +                                       "id yet.\n", commandline);
16200 +               return 1;
16201 +       }
16202 +
16203 +       resume_block_device = open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 0);
16204 +       if (IS_ERR(resume_block_device)) {
16205 +               if (!quiet)
16206 +                       toi_early_boot_message(1, TOI_CONTINUE_REQ,
16207 +                               "Failed to get access to \"%s\", where"
16208 +                               " the swap header should be found.",
16209 +                               commandline);
16210 +               return 1;
16211 +       }
16212 +
16213 +       return 0;
16214 +}
16215 +
16216 +/*
16217 + * If we have read part of the image, we might have filled memory with
16218 + * data that should be zeroed out.
16219 + */
16220 +static void toi_swap_noresume_reset(void)
16221 +{
16222 +       toi_bio_ops.rw_cleanup(READ);
16223 +       memset((char *) &devinfo, 0, sizeof(devinfo));
16224 +}
16225 +
16226 +static int get_current_signature(void)
16227 +{
16228 +       int result;
16229 +
16230 +       if (current_signature_page)
16231 +               return 0;
16232 +
16233 +       current_signature_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
16234 +       if (!current_signature_page)
16235 +               return -ENOMEM;
16236 +
16237 +       result = toi_bio_ops.bdev_page_io(READ, resume_block_device,
16238 +               resume_firstblock, virt_to_page(current_signature_page));
16239 +
16240 +       return result;
16241 +}
16242 +
16243 +static int parse_signature(void)
16244 +{
16245 +       union p_diskpage swap_header_page;
16246 +       struct tuxonice_sig_data *sig;
16247 +       int type;
16248 +       char *swap_header;
16249 +       const char *sigs[] = {
16250 +               "SWAP-SPACE", "SWAPSPACE2", "S1SUSP", "S2SUSP", "S1SUSPEND"
16251 +       };
16252 +
16253 +       if (!current_signature_page) {
16254 +               int result = get_current_signature();
16255 +
16256 +               if (result)
16257 +                       return result;
16258 +       }
16259 +
16260 +       swap_header_page = (union p_diskpage) current_signature_page;
16261 +       sig = (struct tuxonice_sig_data *) current_signature_page;
16262 +       swap_header = swap_header_page.pointer->swh.magic.magic;
16263 +
16264 +       for (type = 0; type < 5; type++)
16265 +               if (!memcmp(sigs[type], swap_header, strlen(sigs[type])))
16266 +                       return type;
16267 +
16268 +       if (memcmp(tuxonice_signature, swap_header, sizeof(tuxonice_signature)))
16269 +               return -1;
16270 +
16271 +       header_dev_t = sig->device;
16272 +       clear_toi_state(TOI_RESUMED_BEFORE);
16273 +       if (sig->resume_attempted)
16274 +               set_toi_state(TOI_RESUMED_BEFORE);
16275 +       headerblock = sig->sector;
16276 +
16277 +       return 10;
16278 +}
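For reference, the return convention above: 0 to 4 index the sigs[] table (ordinary swap, or another implementation's suspend signature), -1 means nothing was recognised, -ENOMEM means the signature page could not be read, and 10 means the TuxOnIce binary signature was found. toi_swap_image_exists() below maps these onto its yes/no/don't-know results.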
16279 +
16280 +static void forget_signatures(void)
16281 +{
16282 +       if (current_signature_page) {
16283 +               toi_free_page(38, (unsigned long) current_signature_page);
16284 +               current_signature_page = NULL;
16285 +       }
16286 +}
16287 +
16288 +/*
16289 + * write_modified_signature
16290 + *
16291 + * Write a (potentially) modified signature page without forgetting the
16292 + * original contents.
16293 + */
16294 +static int write_modified_signature(int modification)
16295 +{
16296 +       union p_diskpage swap_header_page;
16297 +       struct swap_info_struct *si;
16298 +       int result;
16299 +       char *orig_sig;
16300 +
16301 +       /* In case we haven't already */
16302 +       result = get_current_signature();
16303 +
16304 +       if (result)
16305 +               return result;
16306 +
16307 +       swap_header_page.address = toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
16308 +
16309 +       if (!swap_header_page.address)
16310 +               return -ENOMEM;
16311 +
16312 +       memcpy(swap_header_page.ptr, current_signature_page, PAGE_SIZE);
16313 +
16314 +       switch (modification) {
16315 +       case IMAGE_SIGNATURE:
16316 +
16317 +               memcpy(no_image_signature_contents, swap_header_page.ptr,
16318 +                               sizeof(no_image_signature_contents));
16319 +
16320 +               /* Get the details of the header first page. */
16321 +               toi_extent_state_goto_start(&toi_writer_posn);
16322 +               toi_bio_ops.forward_one_page(1);
16323 +
16324 +               si = get_swap_info_struct(toi_writer_posn.current_chain);
16325 +
16326 +               /* Prepare the signature */
16327 +               swap_header_page.pointer->tuxonice_sig_data.device =
16328 +                       si->bdev->bd_dev;
16329 +               swap_header_page.pointer->tuxonice_sig_data.sector =
16330 +                       toi_writer_posn.current_offset;
16331 +               swap_header_page.pointer->tuxonice_sig_data.resume_attempted = 0;
16332 +               swap_header_page.pointer->tuxonice_sig_data.orig_sig_type =
16333 +                       parse_signature();
16334 +
16335 +               memcpy(swap_header_page.pointer->swh.magic.magic,
16336 +                               tuxonice_signature, sizeof(tuxonice_signature));
16337 +
16338 +               break;
16339 +       case NO_IMAGE_SIGNATURE:
16340 +               if (!swap_header_page.pointer->tuxonice_sig_data.orig_sig_type)
16341 +                       orig_sig = "SWAP-SPACE";
16342 +               else
16343 +                       orig_sig = "SWAPSPACE2";
16344 +
16345 +               memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
16346 +               memcpy(swap_header_page.ptr, no_image_signature_contents,
16347 +                               sizeof(no_image_signature_contents));
16348 +               break;
16349 +       case TRIED_RESUME:
16350 +               swap_header_page.pointer->tuxonice_sig_data.resume_attempted = 1;
16351 +               break;
16352 +       case NO_TRIED_RESUME:
16353 +               swap_header_page.pointer->tuxonice_sig_data.resume_attempted = 0;
16354 +               break;
16355 +       }
16356 +
16357 +       result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
16358 +               resume_firstblock, virt_to_page(swap_header_page.address));
16359 +
16360 +       toi_free_page(38, swap_header_page.address);
16361 +
16362 +       return result;
16363 +}
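One routine thus covers all four signature transitions: stamping the TuxOnIce signature when an image has been written, restoring the original swap signature when the image is removed, and setting or clearing the resume_attempted flag in between.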
16364 +
16365 +static int apply_header_reservation(void)
16366 +{
16367 +       int i;
16368 +
16369 +       toi_extent_state_goto_start(&toi_writer_posn);
16370 +       toi_bio_ops.forward_one_page(1); /* To first page */
16371 +
16372 +       for (i = 0; i < header_pages_reserved; i++) {
16373 +               if (toi_bio_ops.forward_one_page(1)) {
16374 +                       printk(KERN_INFO "Out of space while seeking to "
16375 +                                       "allocate header pages.\n");
16376 +                       return -ENOSPC;
16377 +               }
16378 +       }
16379 +
16380 +       /* The end of header pages will be the start of pageset 2;
16381 +        * we are now sitting on the first pageset2 page. */
16382 +       toi_extent_state_save(&toi_writer_posn, &toi_writer_posn_save[2]);
16383 +       return 0;
16384 +}
16385 +
16386 +static void toi_swap_reserve_header_space(int request)
16387 +{
16388 +       header_pages_reserved = (long) request;
16389 +
16390 +       /* Apply now in case storage is already allocated (ignoring the result). */
16391 +       apply_header_reservation();
16392 +}
16393 +
16394 +static void free_block_chains(void)
16395 +{
16396 +       int i;
16397 +
16398 +       for (i = 0; i < MAX_SWAPFILES; i++)
16399 +               if (block_chain[i].first)
16400 +                       toi_put_extent_chain(&block_chain[i]);
16401 +}
16402 +
16403 +static int add_blocks_to_extent_chain(int chain, int minimum, int maximum)
16404 +{
16405 +       if (test_action_state(TOI_TEST_BIO))
16406 +               printk(KERN_INFO "Adding extent chain %d %d-%d.\n", chain,
16407 +                               minimum << devinfo[chain].bmap_shift,
16408 +                               maximum << devinfo[chain].bmap_shift);
16409 +
16410 +       if (toi_add_to_extent_chain(&block_chain[chain], minimum, maximum)) {
16411 +               free_block_chains();
16412 +               return -ENOMEM;
16413 +       }
16414 +
16415 +       return 0;
16416 +}
16417 +
16418 +
16419 +static int get_main_pool_phys_params(void)
16420 +{
16421 +       struct extent *extentpointer = NULL;
16422 +       unsigned long address;
16423 +       int extent_min = -1, extent_max = -1, last_chain = -1;
16424 +
16425 +       free_block_chains();
16426 +
16427 +       toi_extent_for_each(&swapextents, extentpointer, address) {
16428 +               swp_entry_t swap_address = extent_val_to_swap_entry(address);
16429 +               pgoff_t offset = swp_offset(swap_address);
16430 +               unsigned swapfilenum = swp_type(swap_address);
16431 +               struct swap_info_struct *sis =
16432 +                       get_swap_info_struct(swapfilenum);
16433 +               sector_t new_sector = map_swap_page(sis, offset);
16434 +
16435 +               if ((new_sector == extent_max + 1) &&
16436 +                   (last_chain == swapfilenum)) {
16437 +                       extent_max++;
16438 +                       continue;
16439 +               }
16440 +
16441 +               if (extent_min > -1 && add_blocks_to_extent_chain(last_chain,
16442 +                                       extent_min, extent_max))
16443 +                       return -ENOMEM;
16444 +
16445 +               extent_min = extent_max = new_sector;
16446 +               last_chain = swapfilenum;
16447 +       }
16448 +
16449 +       if (extent_min > -1 && add_blocks_to_extent_chain(last_chain,
16450 +                               extent_min, extent_max))
16451 +                       return -ENOMEM;
16452 +
16453 +       return apply_header_reservation();
16454 +}
16455 +
16456 +static long raw_to_real(long raw)
16457 +{
16458 +       long result;
16459 +
16460 +       result = raw - (raw * (sizeof(unsigned long) + sizeof(int)) +
16461 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
16462 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
16463 +
16464 +       return result < 0 ? 0 : result;
16465 +}
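To make the overhead arithmetic concrete (a worked example assuming 4096-byte pages, 8-byte longs and 4-byte ints): each stored page costs sizeof(unsigned long) + sizeof(int) = 12 bytes of index metadata, so raw_to_real(1000) = 1000 - (1000 * 12 + 4109) / 4108 = 1000 - 3 = 997. Roughly three raw pages per thousand go to bookkeeping.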
16466 +
16467 +static int toi_swap_storage_allocated(void)
16468 +{
16469 +       return (int) raw_to_real(swap_pages_allocated - header_pages_reserved);
16470 +}
16471 +
16472 +/*
16473 + * We can't just remember the value from allocation time, because other
16474 + * processes might have allocated swap in the mean time.
16475 + */
16476 +static int toi_swap_storage_available(void)
16477 +{
16478 +       si_swapinfo(&swapinfo);
16479 +       return (int) raw_to_real((long) swapinfo.freeswap +
16480 +                       swap_pages_allocated - header_pages_reserved);
16481 +}
16482 +
16483 +static int toi_swap_initialise(int starting_cycle)
16484 +{
16485 +       if (!starting_cycle)
16486 +               return 0;
16487 +
16488 +       enable_swapfile();
16489 +
16490 +       if (resume_swap_dev_t && !resume_block_device &&
16491 +           IS_ERR(resume_block_device =
16492 +                       open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 1)))
16493 +               return 1;
16494 +
16495 +       return 0;
16496 +}
16497 +
16498 +static void toi_swap_cleanup(int ending_cycle)
16499 +{
16500 +       if (ending_cycle)
16501 +               disable_swapfile();
16502 +
16503 +       close_bdevs();
16504 +
16505 +       forget_signatures();
16506 +}
16507 +
16508 +static int toi_swap_release_storage(void)
16509 +{
16510 +       if (test_action_state(TOI_KEEP_IMAGE) &&
16511 +           test_toi_state(TOI_NOW_RESUMING))
16512 +               return 0;
16513 +
16514 +       header_pages_reserved = 0;
16515 +       swap_pages_allocated = 0;
16516 +
16517 +       if (swapextents.first) {
16518 +               /* Free swap entries */
16519 +               struct extent *extentpointer;
16520 +               unsigned long extentvalue;
16521 +               toi_extent_for_each(&swapextents, extentpointer,
16522 +                               extentvalue)
16523 +                       swap_free(extent_val_to_swap_entry(extentvalue));
16524 +
16525 +               toi_put_extent_chain(&swapextents);
16526 +
16527 +               free_block_chains();
16528 +       }
16529 +
16530 +       return 0;
16531 +}
16532 +
16533 +static void free_swap_range(unsigned long min, unsigned long max)
16534 +{
16535 +       int j;
16536 +
16537 +       for (j = min; j <= max; j++)
16538 +               swap_free(extent_val_to_swap_entry(j));
16539 +}
16540 +
16541 +/*
16542 + * Round robin allocation (used where swap storage has the same priority)
16543 + * could make this very inefficient, so we track extents allocated on
16544 + * a per-swapfile basis.
16545 + *
16546 + * We ignore here the fact that some space is for the header and doesn't
16547 + * have the overhead. It will only rarely make a 1 page difference.
16548 + */
16549 +static int toi_swap_allocate_storage(int request)
16550 +{
16551 +       int i, result = 0, to_add[MAX_SWAPFILES], pages_to_get, extra_pages,
16552 +           gotten = 0;
16553 +       unsigned long extent_min[MAX_SWAPFILES], extent_max[MAX_SWAPFILES];
16554 +
16555 +       extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
16556 +                              + sizeof(int)), PAGE_SIZE);
16557 +       pages_to_get = request + extra_pages - swapextents.size;
16558 +
16559 +       if (pages_to_get < 1)
16560 +               return 0;
16561 +
16562 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16563 +               struct swap_info_struct *si = get_swap_info_struct(i);
16564 +               to_add[i] = 0;
16565 +               if (!si->bdev)
16566 +                       continue;
16567 +               devinfo[i].bdev = si->bdev;
16568 +               devinfo[i].dev_t = si->bdev->bd_dev;
16569 +               devinfo[i].bmap_shift = 3;
16570 +               devinfo[i].blocks_per_page = 1;
16571 +       }
16572 +
16573 +       for (i = 0; i < pages_to_get; i++) {
16574 +               swp_entry_t entry;
16575 +               unsigned long new_value;
16576 +               unsigned swapfilenum;
16577 +
16578 +               entry = get_swap_page();
16579 +               if (!entry.val)
16580 +                       break;
16581 +
16582 +               swapfilenum = swp_type(entry);
16583 +               new_value = swap_entry_to_extent_val(entry);
16584 +
16585 +               if (!to_add[swapfilenum]) {
16586 +                       to_add[swapfilenum] = 1;
16587 +                       extent_min[swapfilenum] = new_value;
16588 +                       extent_max[swapfilenum] = new_value;
16589 +                       gotten++;
16590 +                       continue;
16591 +               }
16592 +
16593 +               if (new_value == extent_max[swapfilenum] + 1) {
16594 +                       extent_max[swapfilenum]++;
16595 +                       gotten++;
16596 +                       continue;
16597 +               }
16598 +
16599 +               if (toi_add_to_extent_chain(&swapextents,
16600 +                                       extent_min[swapfilenum],
16601 +                                       extent_max[swapfilenum])) {
16602 +                       printk(KERN_INFO "Failed to allocate extent for "
16603 +                                       "%lu-%lu.\n", extent_min[swapfilenum],
16604 +                                       extent_max[swapfilenum]);
16605 +                       free_swap_range(extent_min[swapfilenum],
16606 +                                       extent_max[swapfilenum]);
16607 +                       swap_free(entry);
16608 +                       gotten -= (extent_max[swapfilenum] -
16609 +                                       extent_min[swapfilenum] + 1);
16610 +                       /* Don't try to add again below */
16611 +                       to_add[swapfilenum] = 0;
16612 +                       break;
16613 +               } else {
16614 +                       extent_min[swapfilenum] = new_value;
16615 +                       extent_max[swapfilenum] = new_value;
16616 +                       gotten++;
16617 +               }
16618 +       }
16619 +
16620 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16621 +               if (!to_add[i] || !toi_add_to_extent_chain(&swapextents,
16622 +                                       extent_min[i], extent_max[i]))
16623 +                       continue;
16624 +
16625 +               free_swap_range(extent_min[i], extent_max[i]);
16626 +               gotten -= (extent_max[i] - extent_min[i] + 1);
16627 +               break;
16628 +       }
16629 +
16630 +       if (gotten < pages_to_get)
16631 +               result = -ENOSPC;
16632 +
16633 +       swap_pages_allocated += (long) gotten;
16634 +
16635 +       return result ? result : get_main_pool_phys_params();
16636 +}
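The per-swapfile extent_min/extent_max bookkeeping above is plain run coalescing: a value consecutive with the current run extends it, anything else flushes the run and starts a new one. The same idea in miniature, as a standalone userspace sketch (all names illustrative):

        #include <stdio.h>

        static void emit(unsigned long min, unsigned long max)
        {
                printf("extent %lu-%lu\n", min, max);
        }

        int main(void)
        {
                unsigned long vals[] = { 3, 4, 5, 9, 10, 42 };
                unsigned long min = vals[0], max = vals[0];
                size_t i;

                for (i = 1; i < sizeof(vals) / sizeof(vals[0]); i++) {
                        if (vals[i] == max + 1) {   /* extends the run */
                                max = vals[i];
                                continue;
                        }
                        emit(min, max);             /* flush finished run */
                        min = max = vals[i];
                }
                emit(min, max);                     /* prints 3-5, 9-10, 42-42 */
                return 0;
        }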
16637 +
16638 +static int toi_swap_write_header_init(void)
16639 +{
16640 +       int i, result;
16641 +       struct swap_info_struct *si;
16642 +
16643 +       toi_bio_ops.rw_init(WRITE, 0);
16644 +       toi_writer_buffer_posn = 0;
16645 +
16646 +       /* Info needed to bootstrap goes at the start of the header.
16647 +        * First we save the positions and devinfo, including the number
16648 +        * of header pages. Then we save the structs containing data needed
16649 +        * for reading the header pages back.
16650 +        * Note that even if this bootstrap info takes more than one page, when we
16651 +        * read back the info, we will have restored the location of the
16652 +        * next header page by the time we go to use it.
16653 +        */
16654 +
16655 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
16656 +                       (char *) &no_image_signature_contents,
16657 +                       sizeof(struct tuxonice_sig_data));
16658 +
16659 +       if (result)
16660 +               return result;
16661 +
16662 +       /* Forward one page will be done prior to the read */
16663 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16664 +               si = get_swap_info_struct(i);
16665 +               if (si->swap_file)
16666 +                       devinfo[i].dev_t = si->bdev->bd_dev;
16667 +               else
16668 +                       devinfo[i].dev_t = (dev_t) 0;
16669 +       }
16670 +
16671 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
16672 +                       (char *) &toi_writer_posn_save,
16673 +                       sizeof(toi_writer_posn_save));
16674 +
16675 +       if (result)
16676 +               return result;
16677 +
16678 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
16679 +                       (char *) &devinfo, sizeof(devinfo));
16680 +
16681 +       if (result)
16682 +               return result;
16683 +
16684 +       for (i = 0; i < MAX_SWAPFILES; i++)
16685 +               toi_serialise_extent_chain(&toi_swapops, &block_chain[i]);
16686 +
16687 +       return 0;
16688 +}
16689 +
16690 +static int toi_swap_write_header_cleanup(void)
16691 +{
16692 +       /* Write any unsaved data */
16693 +       if (toi_writer_buffer_posn)
16694 +               toi_bio_ops.write_header_chunk_finish();
16695 +
16696 +       toi_bio_ops.finish_all_io();
16697 +
16698 +       /* Set signature to say we have an image */
16699 +       return write_modified_signature(IMAGE_SIGNATURE);
16700 +}
16701 +
16702 +/* ------------------------- HEADER READING ------------------------- */
16703 +
16704 +/*
16705 + * read_header_init()
16706 + *
16707 + * Description:
16708 + * 1. Attempt to read the device specified with resume=.
16709 + * 2. Check the contents of the swap header for our signature.
16710 + * 3. Warn, ignore, reset and/or continue as appropriate.
16711 + * 4. If continuing, read the toi_swap configuration section
16712 + *    of the header and set up block device info so we can read
16713 + *    the rest of the header & image.
16714 + *
16715 + * Returns:
16716 + * May not return if the user chooses to reboot at a warning.
16717 + * -EINVAL if cannot resume at this time. Booting should continue
16718 + * normally.
16719 + */
16720 +
16721 +static int toi_swap_read_header_init(void)
16722 +{
16723 +       int i, result = 0;
16724 +       toi_writer_buffer_posn = 0;
16725 +
16726 +       if (!header_dev_t) {
16727 +               printk(KERN_INFO "read_header_init called when we haven't "
16728 +                               "verified there is an image!\n");
16729 +               return -EINVAL;
16730 +       }
16731 +
16732 +       /*
16733 +        * If the header is not on the resume_swap_dev_t, get the resume device
16734 +        * first.
16735 +        */
16736 +       if (header_dev_t != resume_swap_dev_t) {
16737 +               header_block_device = open_bdev(MAX_SWAPFILES + 1,
16738 +                               header_dev_t, 1);
16739 +
16740 +               if (IS_ERR(header_block_device))
16741 +                       return PTR_ERR(header_block_device);
16742 +       } else
16743 +               header_block_device = resume_block_device;
16744 +
16745 +       toi_bio_ops.read_header_init();
16746 +
16747 +       /*
16748 +        * Read toi_swap configuration.
16749 +        * Headerblock size taken into account already.
16750 +        */
16751 +       result = toi_bio_ops.bdev_page_io(READ, header_block_device,
16752 +                       headerblock << 3,
16753 +                       virt_to_page((unsigned long) toi_writer_buffer));
16754 +       if (result)
16755 +               return result;
16756 +
16757 +       memcpy(&no_image_signature_contents, toi_writer_buffer,
16758 +                       sizeof(no_image_signature_contents));
16759 +
16760 +       toi_writer_buffer_posn = sizeof(no_image_signature_contents);
16761 +
16762 +       memcpy(&toi_writer_posn_save, toi_writer_buffer + toi_writer_buffer_posn,
16763 +                       sizeof(toi_writer_posn_save));
16764 +
16765 +       toi_writer_buffer_posn += sizeof(toi_writer_posn_save);
16766 +
16767 +       memcpy(&devinfo, toi_writer_buffer + toi_writer_buffer_posn,
16768 +                       sizeof(devinfo));
16769 +
16770 +       toi_writer_buffer_posn += sizeof(devinfo);
16771 +
16772 +       /* Restore device info */
16773 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16774 +               dev_t thisdevice = devinfo[i].dev_t;
16775 +               struct block_device *result;
16776 +
16777 +               devinfo[i].bdev = NULL;
16778 +
16779 +               if (!thisdevice)
16780 +                       continue;
16781 +
16782 +               if (thisdevice == resume_swap_dev_t) {
16783 +                       devinfo[i].bdev = resume_block_device;
16784 +                       continue;
16785 +               }
16786 +
16787 +               if (thisdevice == header_dev_t) {
16788 +                       devinfo[i].bdev = header_block_device;
16789 +                       continue;
16790 +               }
16791 +
16792 +               result = open_bdev(i, thisdevice, 1);
16793 +               if (IS_ERR(result))
16794 +                       return PTR_ERR(result);
16795 +               devinfo[i].bdev = bdevs_opened[i]->bdev;
16796 +       }
16797 +
16798 +       toi_extent_state_goto_start(&toi_writer_posn);
16799 +       toi_bio_ops.set_extra_page_forward();
16800 +
16801 +       for (i = 0; i < MAX_SWAPFILES && !result; i++)
16802 +               result = toi_load_extent_chain(&block_chain[i]);
16803 +
16804 +       return result;
16805 +}
16806 +
16807 +static int toi_swap_read_header_cleanup(void)
16808 +{
16809 +       toi_bio_ops.rw_cleanup(READ);
16810 +       return 0;
16811 +}
16812 +
16813 +/*
16814 + * workspace_size
16815 + *
16816 + * Description:
16817 + * Returns the number of bytes of RAM needed for this
16818 + * code to do its work. (Used when calculating whether
16819 + * we have enough memory to be able to hibernate & resume).
16820 + *
16821 + */
16822 +static int toi_swap_memory_needed(void)
16823 +{
16824 +       return 1;
16825 +}
16826 +
16827 +/*
16828 + * Print debug info
16829 + *
16830 + * Description: Report allocator status and the swap available for an image.
16831 + */
16832 +static int toi_swap_print_debug_stats(char *buffer, int size)
16833 +{
16834 +       int len = 0;
16835 +       struct sysinfo sysinfo;
16836 +
16837 +       if (toiActiveAllocator != &toi_swapops) {
16838 +               len = snprintf_used(buffer, size,
16839 +                               "- SwapAllocator inactive.\n");
16840 +               return len;
16841 +       }
16842 +
16843 +       len = snprintf_used(buffer, size, "- SwapAllocator active.\n");
16844 +       if (swapfilename[0])
16845 +               len += snprintf_used(buffer+len, size-len,
16846 +                       "  Attempting to automatically swapon: %s.\n",
16847 +                       swapfilename);
16848 +
16849 +       si_swapinfo(&sysinfo);
16850 +
16851 +       len += snprintf_used(buffer+len, size-len,
16852 +                       "  Swap available for image: %ld pages.\n",
16853 +                       (long) sysinfo.freeswap + toi_swap_storage_allocated());
16854 +
16855 +       return len;
16856 +}
16857 +
16858 +/*
16859 + * Storage needed
16860 + *
16861 + * Returns amount of space in the swap header required
16862 + * for the toi_swap's data. This ignores the links between
16863 + * pages, which we factor in when allocating the space.
16864 + *
16865 + * We ensure the space is allocated, but actually save the
16866 + * data from write_header_init and therefore don't also define a
16867 + * save_config_info routine.
16868 + */
16869 +static int toi_swap_storage_needed(void)
16870 +{
16871 +       int i, result;
16872 +       result = sizeof(toi_writer_posn_save) + sizeof(devinfo);
16873 +
16874 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16875 +               result += 3 * sizeof(int);
16876 +               result += (2 * sizeof(unsigned long) *
16877 +                       block_chain[i].num_extents);
16878 +       }
16879 +
16880 +       return result;
16881 +}
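For example (on a 64-bit build, assuming a single chain holding two extents), that chain contributes 3 * 4 + 2 * 8 * 2 = 44 bytes on top of the fixed sizeof(toi_writer_posn_save) + sizeof(devinfo) cost.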
16882 +
16883 +/*
16884 + * Image_exists
16885 + *
16886 + * Returns -1 if we don't know, otherwise 0 (no) or 1 (yes).
16887 + */
16888 +static int toi_swap_image_exists(int quiet)
16889 +{
16890 +       int signature_found;
16891 +
16892 +       if (!resume_swap_dev_t) {
16893 +               if (!quiet)
16894 +                       printk(KERN_INFO "Not even trying to read header "
16895 +                               "because resume_swap_dev_t is not set.\n");
16896 +               return -1;
16897 +       }
16898 +
16899 +       if (!resume_block_device &&
16900 +           IS_ERR(resume_block_device =
16901 +                       open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 1))) {
16902 +               if (!quiet)
16903 +                       printk(KERN_INFO "Failed to open resume dev_t (%x).\n",
16904 +                               resume_swap_dev_t);
16905 +               return -1;
16906 +       }
16907 +
16908 +       signature_found = parse_signature();
16909 +
16910 +       switch (signature_found) {
16911 +       case -ENOMEM:
16912 +               return -1;
16913 +       case -1:
16914 +               if (!quiet)
16915 +                       printk(KERN_ERR "TuxOnIce: Unable to find a signature."
16916 +                               " Could you have moved a swap file?\n");
16917 +               return -1;
16918 +       case 0:
16919 +       case 1:
16920 +               if (!quiet)
16921 +                       printk(KERN_INFO "TuxOnIce: Normal swapspace found.\n");
16922 +               return 0;
16923 +       case 2:
16924 +       case 3:
16925 +       case 4:
16926 +               if (!quiet)
16927 +                       printk(KERN_INFO "TuxOnIce: Detected another "
16928 +                               "implementation's signature.\n");
16929 +               return 0;
16930 +       case 10:
16931 +               if (!quiet)
16932 +                       printk(KERN_INFO "TuxOnIce: Detected TuxOnIce binary "
16933 +                               "signature.\n");
16934 +               return 1;
16935 +       }
16936 +
16937 +       BUG();
16938 +       return 0;
16939 +}
16940 +
16941 +/* toi_swap_remove_image
16942 + * Release the image's storage and restore the original swap signature.
16943 + */
16944 +static int toi_swap_remove_image(void)
16945 +{
16946 +       /*
16947 +        * If nr_hibernates == 0, we must be booting, so no swap pages
16948 +        * will be recorded as used yet.
16949 +        */
16950 +
16951 +       if (nr_hibernates)
16952 +               toi_swap_release_storage();
16953 +
16954 +       /*
16955 +        * We don't do a sanity check here: we want to restore the swap
16956 +        * signature whatever version of the kernel made the hibernation image.
16957 +        *
16958 +        * We need to write to swap, but swap may not be enabled, so
16959 +        * we write to the device directly.
16960 +        *
16961 +        * If we don't have a current_signature_page, we didn't
16962 +        * read an image header, so don't change anything.
16963 +        */
16964 +
16965 +       return toi_swap_image_exists(1) ?
16966 +               write_modified_signature(NO_IMAGE_SIGNATURE) : 0;
16967 +}
16968 +
16969 +/*
16970 + * Mark resume attempted.
16971 + *
16972 + * Record that we tried to resume from this image. We have already read the
16973 + * signature in. We just need to write the modified version.
16974 + */
16975 +static int toi_swap_mark_resume_attempted(int mark)
16976 +{
16977 +       if (!resume_swap_dev_t) {
16978 +               printk(KERN_INFO "Not even trying to record attempt at resuming"
16979 +                               " because resume_swap_dev_t is not set.\n");
16980 +               return -ENODEV;
16981 +       }
16982 +
16983 +       return write_modified_signature(mark ? TRIED_RESUME : NO_TRIED_RESUME);
16984 +}
16985 +
16986 +/*
16987 + * Parse Image Location
16988 + *
16989 + * Attempt to parse a resume= parameter.
16990 + * Swap Writer accepts:
16991 + * resume=swap:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
16992 + *
16993 + * Where:
16994 + * DEVNAME is convertible to a dev_t by name_to_dev_t
16995 + * FIRSTBLOCK is the location of the first block in the swap file
16996 + * (specifying for a swap partition is nonsensical but not prohibited).
16997 + * Data is validated by attempting to read a swap header from the
16998 + * location given. Failure will result in toi_swap refusing to
16999 + * save an image, and a reboot with correct parameters will be
17000 + * necessary.
17001 + */
17002 +static int toi_swap_parse_sig_location(char *commandline,
17003 +               int only_allocator, int quiet)
17004 +{
17005 +       char *thischar, *devstart, *colon = NULL;
17006 +       int signature_found, result = -EINVAL, temp_result;
17007 +
17008 +       if (strncmp(commandline, "swap:", 5)) {
17009 +               /*
17010 +                * Failing swap:, we'll take a simple
17011 +                * resume=/dev/hda2, but fall through to
17012 +                * other allocators if /dev/ isn't matched.
17013 +                */
17014 +               if (strncmp(commandline, "/dev/", 5))
17015 +                       return 1;
17016 +       } else
17017 +               commandline += 5;
17018 +
17019 +       devstart = thischar = commandline;
17020 +       while ((*thischar != ':') && (*thischar != '@') &&
17021 +               ((thischar - commandline) < 250) && (*thischar))
17022 +               thischar++;
17023 +
17024 +       if (*thischar == ':') {
17025 +               colon = thischar;
17026 +               *colon = 0;
17027 +               thischar++;
17028 +       }
17029 +
17030 +       while ((thischar - commandline) < 250 && *thischar)
17031 +               thischar++;
17032 +
17033 +       if (colon)
17034 +               resume_firstblock = (int) simple_strtoul(colon + 1, NULL, 0);
17035 +       else
17036 +               resume_firstblock = 0;
17037 +
17038 +       clear_toi_state(TOI_CAN_HIBERNATE);
17039 +       clear_toi_state(TOI_CAN_RESUME);
17040 +
17041 +       temp_result = try_to_parse_resume_device(devstart, quiet);
17042 +
17043 +       if (colon)
17044 +               *colon = ':';
17045 +
17046 +       if (temp_result)
17047 +               return -EINVAL;
17048 +
17049 +       signature_found = toi_swap_image_exists(quiet);
17050 +
17051 +       if (signature_found != -1) {
17052 +               result = 0;
17053 +
17054 +               toi_bio_ops.set_devinfo(devinfo);
17055 +               toi_writer_posn.chains = &block_chain[0];
17056 +               toi_writer_posn.num_chains = MAX_SWAPFILES;
17057 +               set_toi_state(TOI_CAN_HIBERNATE);
17058 +               set_toi_state(TOI_CAN_RESUME);
17059 +       } else
17060 +               if (!quiet)
17061 +                       printk(KERN_ERR "TuxOnIce: SwapAllocator: No swap "
17062 +                               "signature found at %s.\n", devstart);
17063 +       return result;
17064 +}
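For example, resume=swap:/dev/hda2:0x12d (a hypothetical device) leaves devstart pointing at "/dev/hda2" and sets resume_firstblock to 0x12d; when no colon is present, resume_firstblock defaults to 0.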
17065 +
17066 +static int header_locations_read_sysfs(const char *page, int count)
17067 +{
17068 +       int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
17069 +       struct inode *swapf = NULL;
17070 +       int zone;
17071 +       char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
17072 +       char *path, *output = (char *) page;
17073 +       int path_len;
17074 +
17075 +       if (!page || !path_page)
17076 +               return 0;
17077 +
17078 +       for (i = 0; i < MAX_SWAPFILES; i++) {
17079 +               struct swap_info_struct *si = get_swap_info_struct(i);
17080 +
17081 +               if (!si->swap_file)
17082 +                       continue;
17083 +
17084 +               if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
17085 +                       haveswap = 1;
17086 +                       if (!printedpartitionsmessage) {
17087 +                               len += sprintf(output + len,
17088 +                                       "For swap partitions, simply use the "
17089 +                                       "format: resume=swap:/dev/hda1.\n");
17090 +                               printedpartitionsmessage = 1;
17091 +                       }
17092 +               } else {
17093 +                       path_len = 0;
17094 +
17095 +                       path = d_path(&si->swap_file->f_path, path_page,
17096 +                                       PAGE_SIZE);
17097 +                       path_len = snprintf(path_page, 31, "%s", path);
17098 +
17099 +                       haveswap = 1;
17100 +                       swapf = si->swap_file->f_mapping->host;
17101 +                       zone = bmap(swapf, 0);
17102 +                       if (!zone) {
17103 +                               len += sprintf(output + len,
17104 +                                       "Swapfile %s has been corrupted. Run"
17105 +                                       " mkswap on it again and retry.\n",
17106 +                                       path_page);
17107 +                       } else {
17108 +                               char name_buffer[255];
17109 +                               len += sprintf(output + len,
17110 +                                       "For swapfile `%s`,"
17111 +                                       " use resume=swap:/dev/%s:0x%x.\n",
17112 +                                       path_page,
17113 +                                       bdevname(si->bdev, name_buffer),
17114 +                                       zone << (swapf->i_blkbits - 9));
17115 +                       }
17116 +               }
17117 +       }
17118 +
17119 +       if (!haveswap)
17120 +               len = sprintf(output, "You need to turn on swap partitions "
17121 +                               "before examining this file.\n");
17122 +
17123 +       toi_free_page(10, (unsigned long) path_page);
17124 +       return len;
17125 +}
17126 +
17127 +static struct toi_sysfs_data sysfs_params[] = {
17128 +       {
17129 +        TOI_ATTR("swapfilename", SYSFS_RW),
17130 +        SYSFS_STRING(swapfilename, 255, 0)
17131 +       },
17132 +
17133 +       {
17134 +        TOI_ATTR("headerlocations", SYSFS_READONLY),
17135 +        SYSFS_CUSTOM(header_locations_read_sysfs, NULL, 0)
17136 +       },
17137 +
17138 +       { TOI_ATTR("enabled", SYSFS_RW),
17139 +         SYSFS_INT(&toi_swapops.enabled, 0, 1, 0),
17140 +         .write_side_effect            = attempt_to_parse_resume_device2,
17141 +       }
17142 +};
17143 +
17144 +static struct toi_module_ops toi_swapops = {
17145 +       .type                                   = WRITER_MODULE,
17146 +       .name                                   = "swap storage",
17147 +       .directory                              = "swap",
17148 +       .module                                 = THIS_MODULE,
17149 +       .memory_needed                          = toi_swap_memory_needed,
17150 +       .print_debug_info                       = toi_swap_print_debug_stats,
17151 +       .storage_needed                         = toi_swap_storage_needed,
17152 +       .initialise                             = toi_swap_initialise,
17153 +       .cleanup                                = toi_swap_cleanup,
17154 +
17155 +       .noresume_reset         = toi_swap_noresume_reset,
17156 +       .storage_available      = toi_swap_storage_available,
17157 +       .storage_allocated      = toi_swap_storage_allocated,
17158 +       .release_storage        = toi_swap_release_storage,
17159 +       .reserve_header_space   = toi_swap_reserve_header_space,
17160 +       .allocate_storage       = toi_swap_allocate_storage,
17161 +       .image_exists           = toi_swap_image_exists,
17162 +       .mark_resume_attempted  = toi_swap_mark_resume_attempted,
17163 +       .write_header_init      = toi_swap_write_header_init,
17164 +       .write_header_cleanup   = toi_swap_write_header_cleanup,
17165 +       .read_header_init       = toi_swap_read_header_init,
17166 +       .read_header_cleanup    = toi_swap_read_header_cleanup,
17167 +       .remove_image           = toi_swap_remove_image,
17168 +       .parse_sig_location     = toi_swap_parse_sig_location,
17169 +
17170 +       .sysfs_data             = sysfs_params,
17171 +       .num_sysfs_entries      = sizeof(sysfs_params) /
17172 +               sizeof(struct toi_sysfs_data),
17173 +};
17174 +
17175 +/* ---- Registration ---- */
17176 +static __init int toi_swap_load(void)
17177 +{
17178 +       toi_swapops.rw_init = toi_bio_ops.rw_init;
17179 +       toi_swapops.rw_cleanup = toi_bio_ops.rw_cleanup;
17180 +       toi_swapops.read_page = toi_bio_ops.read_page;
17181 +       toi_swapops.write_page = toi_bio_ops.write_page;
17182 +       toi_swapops.rw_header_chunk = toi_bio_ops.rw_header_chunk;
17183 +       toi_swapops.rw_header_chunk_noreadahead =
17184 +               toi_bio_ops.rw_header_chunk_noreadahead;
17185 +       toi_swapops.io_flusher = toi_bio_ops.io_flusher;
17186 +
17187 +       return toi_register_module(&toi_swapops);
17188 +}
17189 +
17190 +#ifdef MODULE
17191 +static __exit void toi_swap_unload(void)
17192 +{
17193 +       toi_unregister_module(&toi_swapops);
17194 +}
17195 +
17196 +module_init(toi_swap_load);
17197 +module_exit(toi_swap_unload);
17198 +MODULE_LICENSE("GPL");
17199 +MODULE_AUTHOR("Nigel Cunningham");
17200 +MODULE_DESCRIPTION("TuxOnIce SwapAllocator");
17201 +#else
17202 +late_initcall(toi_swap_load);
17203 +#endif
17204 diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
17205 new file mode 100644
17206 index 0000000..f5ab465
17207 --- /dev/null
17208 +++ b/kernel/power/tuxonice_sysfs.c
17209 @@ -0,0 +1,337 @@
17210 +/*
17211 + * kernel/power/tuxonice_sysfs.c
17212 + *
17213 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
17214 + *
17215 + * This file is released under the GPLv2.
17216 + *
17217 + * This file contains support for sysfs entries for tuning TuxOnIce.
17218 + *
17219 + * We have a generic handler that deals with the most common cases, and
17220 + * hooks for special handlers to use.
17221 + */
17222 +
17223 +#include <linux/suspend.h>
17224 +#include <linux/module.h>
17225 +#include <asm/uaccess.h>
17226 +
17227 +#include "tuxonice_sysfs.h"
17228 +#include "tuxonice.h"
17229 +#include "tuxonice_storage.h"
17230 +#include "tuxonice_alloc.h"
17231 +
17232 +static int toi_sysfs_initialised;
17233 +
17234 +static void toi_initialise_sysfs(void);
17235 +
17236 +static struct toi_sysfs_data sysfs_params[];
17237 +
17238 +#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
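to_sysfs_data() is the usual container_of() pattern: sysfs hands back the embedded struct attribute, and we recover the enclosing toi_sysfs_data. A standalone sketch of the pattern (simplified types, userspace offsetof standing in for the kernel macro):

        #include <assert.h>
        #include <stddef.h>

        struct attr { const char *name; };
        struct data { int type; struct attr attr; };

        /* Same trick as the kernel's container_of(). */
        #define to_data(_a) \
                ((struct data *) ((char *) (_a) - offsetof(struct data, attr)))

        int main(void)
        {
                struct data d;
                struct attr *a = &d.attr;   /* what sysfs would hand us */

                assert(to_data(a) == &d);   /* recover the container */
                return 0;
        }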
17239 +
17240 +static void toi_main_wrapper(void)
17241 +{
17242 +       _toi_try_hibernate(0);
17243 +}
17244 +
17245 +static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
17246 +                             char *page)
17247 +{
17248 +       struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
17249 +       int len = 0;
17250 +
17251 +       if (toi_start_anything(0))
17252 +               return -EBUSY;
17253 +
17254 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
17255 +               toi_prepare_usm();
17256 +
17257 +       switch (sysfs_data->type) {
17258 +       case TOI_SYSFS_DATA_CUSTOM:
17259 +               len = (sysfs_data->data.special.read_sysfs) ?
17260 +                       (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
17261 +                       : 0;
17262 +               break;
17263 +       case TOI_SYSFS_DATA_BIT:
17264 +               len = sprintf(page, "%d\n",
17265 +                       !!test_bit(sysfs_data->data.bit.bit,
17266 +                               sysfs_data->data.bit.bit_vector));
17267 +               break;
17268 +       case TOI_SYSFS_DATA_INTEGER:
17269 +               len = sprintf(page, "%d\n",
17270 +                       *(sysfs_data->data.integer.variable));
17271 +               break;
17272 +       case TOI_SYSFS_DATA_LONG:
17273 +               len = sprintf(page, "%ld\n",
17274 +                       *(sysfs_data->data.a_long.variable));
17275 +               break;
17276 +       case TOI_SYSFS_DATA_UL:
17277 +               len = sprintf(page, "%lu\n",
17278 +                       *(sysfs_data->data.ul.variable));
17279 +               break;
17280 +       case TOI_SYSFS_DATA_STRING:
17281 +               len = sprintf(page, "%s\n",
17282 +                       sysfs_data->data.string.variable);
17283 +               break;
17284 +       }
17285 +       /* Side effect routine? */
17286 +       if (sysfs_data->read_side_effect)
17287 +               sysfs_data->read_side_effect();
17288 +
17289 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
17290 +               toi_cleanup_usm();
17291 +
17292 +       toi_finish_anything(0);
17293 +
17294 +       return len;
17295 +}
17296 +
17297 +#define BOUND(_variable, _type) \
17298 +       do { \
17299 +       if (*_variable < sysfs_data->data._type.minimum) \
17300 +               *_variable = sysfs_data->data._type.minimum; \
17301 +       else if (*_variable > sysfs_data->data._type.maximum) \
17302 +               *_variable = sysfs_data->data._type.maximum; \
17303 +       } while (0)
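So, for instance, BOUND(variable, integer) expands (inside its do/while wrapper) to a clamp against the entry's declared range:

        if (*variable < sysfs_data->data.integer.minimum)
                *variable = sysfs_data->data.integer.minimum;
        else if (*variable > sysfs_data->data.integer.maximum)
                *variable = sysfs_data->data.integer.maximum;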
17304 +
17305 +static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
17306 +               const char *my_buf, size_t count)
17307 +{
17308 +       int assigned_temp_buffer = 0, result = count;
17309 +       struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
17310 +
17311 +       if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
17312 +               return -EBUSY;
17313 +
17314 +       ((char *) my_buf)[count] = 0;
17315 +
17316 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
17317 +               toi_prepare_usm();
17318 +
17319 +       switch (sysfs_data->type) {
17320 +       case TOI_SYSFS_DATA_CUSTOM:
17321 +               if (sysfs_data->data.special.write_sysfs)
17322 +                       result = (sysfs_data->data.special.write_sysfs)
17323 +                               (my_buf, count);
17324 +               break;
17325 +       case TOI_SYSFS_DATA_BIT:
17326 +               {
17327 +               int value = simple_strtoul(my_buf, NULL, 0);
17328 +               if (value)
17329 +                       set_bit(sysfs_data->data.bit.bit,
17330 +                               (sysfs_data->data.bit.bit_vector));
17331 +               else
17332 +                       clear_bit(sysfs_data->data.bit.bit,
17333 +                               (sysfs_data->data.bit.bit_vector));
17334 +               }
17335 +               break;
17336 +       case TOI_SYSFS_DATA_INTEGER:
17337 +               {
17338 +                       int *variable =
17339 +                               sysfs_data->data.integer.variable;
17340 +                       *variable = simple_strtol(my_buf, NULL, 0);
17341 +                       BOUND(variable, integer);
17342 +                       break;
17343 +               }
17344 +       case TOI_SYSFS_DATA_LONG:
17345 +               {
17346 +                       long *variable =
17347 +                               sysfs_data->data.a_long.variable;
17348 +                       *variable = simple_strtol(my_buf, NULL, 0);
17349 +                       BOUND(variable, a_long);
17350 +                       break;
17351 +               }
17352 +       case TOI_SYSFS_DATA_UL:
17353 +               {
17354 +                       unsigned long *variable =
17355 +                               sysfs_data->data.ul.variable;
17356 +                       *variable = simple_strtoul(my_buf, NULL, 0);
17357 +                       BOUND(variable, ul);
17358 +                       break;
17359 +               }
17361 +       case TOI_SYSFS_DATA_STRING:
17362 +               {
17363 +                       int copy_len = count;
17364 +                       char *variable =
17365 +                               sysfs_data->data.string.variable;
17366 +
17367 +                       if (sysfs_data->data.string.max_length &&
17368 +                           (copy_len > sysfs_data->data.string.max_length))
17369 +                               copy_len = sysfs_data->data.string.max_length;
17370 +
17371 +                       if (!variable) {
17372 +                               variable = (char *) toi_get_zeroed_page(31,
17373 +                                               TOI_ATOMIC_GFP);
17374 +                               sysfs_data->data.string.variable = variable;
17375 +                               assigned_temp_buffer = 1;
17376 +                       }
17377 +                       strncpy(variable, my_buf, copy_len);
17378 +                       if (copy_len &&
17379 +                                (my_buf[copy_len - 1] == '\n'))
17380 +                               variable[copy_len - 1] = 0;
17381 +                       variable[copy_len] = 0;
17382 +               }
17383 +               break;
17384 +       }
17385 +
17386 +       /* Side effect routine? */
17387 +       if (sysfs_data->write_side_effect)
17388 +               sysfs_data->write_side_effect();
17389 +
17390 +       /* Free temporary buffers */
17391 +       if (assigned_temp_buffer) {
17392 +               toi_free_page(31,
17393 +                       (unsigned long) sysfs_data->data.string.variable);
17394 +               sysfs_data->data.string.variable = NULL;
17395 +       }
17396 +
17397 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
17398 +               toi_cleanup_usm();
17399 +
17400 +       toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
17401 +
17402 +       return result;
17403 +}
17404 +
17405 +static struct sysfs_ops toi_sysfs_ops = {
17406 +       .show   = &toi_attr_show,
17407 +       .store  = &toi_attr_store,
17408 +};
17409 +
17410 +static struct kobj_type toi_ktype = {
17411 +       .sysfs_ops      = &toi_sysfs_ops,
17412 +};
17413 +
17414 +struct kobject *tuxonice_kobj;
17415 +
17416 +/* Non-module sysfs entries.
17417 + *
17418 + * This array contains entries that are automatically registered at
17419 + * boot. Modules and the console code register their own entries separately.
17420 + *
17421 + * NB: If you move do_hibernate, change toi_write_sysfs's test so that
17422 + * toi_start_anything still gets a 1 when the user echoes > do_hibernate!
17423 + */
17424 +
17425 +static struct toi_sysfs_data sysfs_params[] = {
17426 +       { TOI_ATTR("do_hibernate", SYSFS_WRITEONLY),
17427 +         SYSFS_CUSTOM(NULL, NULL, SYSFS_HIBERNATING),
17428 +         .write_side_effect = toi_main_wrapper
17429 +       },
17430 +
17431 +       { TOI_ATTR("do_resume", SYSFS_WRITEONLY),
17432 +         SYSFS_CUSTOM(NULL, NULL, SYSFS_RESUMING),
17433 +         .write_side_effect = __toi_try_resume
17434 +       },
17435 +
17436 +};
17437 +
17438 +void remove_toi_sysdir(struct kobject *kobj)
17439 +{
17440 +       if (!kobj)
17441 +               return;
17442 +
17443 +       kobject_put(kobj);
17444 +}
17445 +
17446 +struct kobject *make_toi_sysdir(char *name)
17447 +{
17448 +       struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
17449 +
17450 +       if (!kobj) {
17451 +               printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
17452 +                               "dir!\n");
17453 +               return NULL;
17454 +       }
17455 +
17456 +       kobj->ktype = &toi_ktype;
17457 +
17458 +       return kobj;
17459 +}
17460 +
17461 +/* toi_register_sysfs_file
17462 + *
17463 + * Helper for registering a new /sys/power/tuxonice entry.
17464 + */
17465 +
17466 +int toi_register_sysfs_file(
17467 +               struct kobject *kobj,
17468 +               struct toi_sysfs_data *toi_sysfs_data)
17469 +{
17470 +       int result;
17471 +
17472 +       if (!toi_sysfs_initialised)
17473 +               toi_initialise_sysfs();
17474 +
17475 +       result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
17476 +       if (result)
17477 +               printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
17478 +                       "returned %d.\n",
17479 +                       toi_sysfs_data->attr.name, result);
17480 +       kobj->ktype = &toi_ktype;
17481 +
17482 +       return result;
17483 +}
17484 +EXPORT_SYMBOL_GPL(toi_register_sysfs_file);
17485 +
17486 +/* toi_unregister_sysfs_file
17487 + *
17488 + * Helper for removing unwanted /sys/power/tuxonice entries.
17489 + *
17490 + */
17491 +void toi_unregister_sysfs_file(struct kobject *kobj,
17492 +               struct toi_sysfs_data *toi_sysfs_data)
17493 +{
17494 +       sysfs_remove_file(kobj, &toi_sysfs_data->attr);
17495 +}
17496 +EXPORT_SYMBOL_GPL(toi_unregister_sysfs_file);
17497 +
17498 +void toi_cleanup_sysfs(void)
17499 +{
17500 +       int i,
17501 +           numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
17502 +
17503 +       if (!toi_sysfs_initialised)
17504 +               return;
17505 +
17506 +       for (i = 0; i < numfiles; i++)
17507 +               toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
17508 +
17509 +       kobject_put(tuxonice_kobj);
17510 +       toi_sysfs_initialised = 0;
17511 +}
17512 +
17513 +/* toi_initialise_sysfs
17514 + *
17515 + * Initialise the /sys/power/tuxonice directory.
17516 + */
17517 +
17518 +static void toi_initialise_sysfs(void)
17519 +{
17520 +       int i;
17521 +       int numfiles = ARRAY_SIZE(sysfs_params);
17522 +
17523 +       if (toi_sysfs_initialised)
17524 +               return;
17525 +
17526 +       /* Make our TuxOnIce directory a child of /sys/power */
17527 +       tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
17528 +       if (!tuxonice_kobj)
17529 +               return;
17530 +
17531 +       toi_sysfs_initialised = 1;
17532 +
17533 +       for (i = 0; i < numfiles; i++)
17534 +               toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
17535 +}
17536 +
17537 +int toi_sysfs_init(void)
17538 +{
17539 +       toi_initialise_sysfs();
17540 +       return 0;
17541 +}
17542 +
17543 +void toi_sysfs_exit(void)
17544 +{
17545 +       toi_cleanup_sysfs();
17546 +}
17547 diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
17548 new file mode 100644
17549 index 0000000..bf56414
17550 --- /dev/null
17551 +++ b/kernel/power/tuxonice_sysfs.h
17552 @@ -0,0 +1,127 @@
17553 +/*
17554 + * kernel/power/tuxonice_sysfs.h
17555 + *
17556 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
17557 + *
17558 + * This file is released under the GPLv2.
17559 + */
17560 +
17561 +#include <linux/sysfs.h>
17562 +#include "power.h"
17563 +
17564 +struct toi_sysfs_data {
17565 +       struct attribute attr;
17566 +       int type;
17567 +       int flags;
17568 +       union {
17569 +               struct {
17570 +                       unsigned long *bit_vector;
17571 +                       int bit;
17572 +               } bit;
17573 +               struct {
17574 +                       int *variable;
17575 +                       int minimum;
17576 +                       int maximum;
17577 +               } integer;
17578 +               struct {
17579 +                       long *variable;
17580 +                       long minimum;
17581 +                       long maximum;
17582 +               } a_long;
17583 +               struct {
17584 +                       unsigned long *variable;
17585 +                       unsigned long minimum;
17586 +                       unsigned long maximum;
17587 +               } ul;
17588 +               struct {
17589 +                       char *variable;
17590 +                       int max_length;
17591 +               } string;
17592 +               struct {
17593 +                       int (*read_sysfs) (const char *buffer, int count);
17594 +                       int (*write_sysfs) (const char *buffer, int count);
17595 +                       void *data;
17596 +               } special;
17597 +       } data;
17598 +
17599 +       /* Side-effect routines. Used, e.g., for reparsing the
17600 +        * resume= entry when it changes. */
17601 +       void (*read_side_effect) (void);
17602 +       void (*write_side_effect) (void);
17603 +       struct list_head sysfs_data_list;
17604 +};
17605 +
17606 +enum {
17607 +       TOI_SYSFS_DATA_NONE = 1,
17608 +       TOI_SYSFS_DATA_CUSTOM,
17609 +       TOI_SYSFS_DATA_BIT,
17610 +       TOI_SYSFS_DATA_INTEGER,
17611 +       TOI_SYSFS_DATA_UL,
17612 +       TOI_SYSFS_DATA_LONG,
17613 +       TOI_SYSFS_DATA_STRING
17614 +};
17615 +
17616 +#define TOI_ATTR(_name, _mode)      \
17617 +       .attr = {.name  = _name , .mode   = _mode }
17618 +
17619 +#define SYSFS_BIT(_ul, _bit, _flags) \
17620 +       .type = TOI_SYSFS_DATA_BIT, \
17621 +       .flags = _flags, \
17622 +       .data = { .bit = { .bit_vector = _ul, .bit = _bit } }
17623 +
17624 +#define SYSFS_INT(_int, _min, _max, _flags) \
17625 +       .type = TOI_SYSFS_DATA_INTEGER, \
17626 +       .flags = _flags, \
17627 +       .data = { .integer = { .variable = _int, .minimum = _min, \
17628 +                       .maximum = _max } }
17629 +
17630 +#define SYSFS_UL(_ul, _min, _max, _flags) \
17631 +       .type = TOI_SYSFS_DATA_UL, \
17632 +       .flags = _flags, \
17633 +       .data = { .ul = { .variable = _ul, .minimum = _min, \
17634 +                       .maximum = _max } }
17635 +
17636 +#define SYSFS_LONG(_long, _min, _max, _flags) \
17637 +       .type = TOI_SYSFS_DATA_LONG, \
17638 +       .flags = _flags, \
17639 +       .data = { .a_long = { .variable = _long, .minimum = _min, \
17640 +                       .maximum = _max } }
17641 +
17642 +#define SYSFS_STRING(_string, _max_len, _flags) \
17643 +       .type = TOI_SYSFS_DATA_STRING, \
17644 +       .flags = _flags, \
17645 +       .data = { .string = { .variable = _string, .max_length = _max_len } }
17646 +
17647 +#define SYSFS_CUSTOM(_read, _write, _flags) \
17648 +       .type = TOI_SYSFS_DATA_CUSTOM, \
17649 +       .flags = _flags, \
17650 +       .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }
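Each entry pairs TOI_ATTR with exactly one of the data macros above. As an
illustration (hypothetical entry, not from the patch), the declaration

	static struct toi_sysfs_data example_bit = {
		TOI_ATTR("example_flag", SYSFS_RW),
		SYSFS_BIT(&toi_bkd.toi_action, TOI_PAUSE, 0)
	};

expands to designated initialisers that fill .attr, .type, .flags and the
matching union member:

	.attr  = { .name = "example_flag", .mode = 0644 },
	.type  = TOI_SYSFS_DATA_BIT,
	.flags = 0,
	.data  = { .bit = { .bit_vector = &toi_bkd.toi_action,
			    .bit = TOI_PAUSE } }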
17651 +
17652 +#define SYSFS_WRITEONLY 0200
17653 +#define SYSFS_READONLY 0444
17654 +#define SYSFS_RW 0644
17655 +
17656 +/* Flags */
17657 +#define SYSFS_NEEDS_SM_FOR_READ 1
17658 +#define SYSFS_NEEDS_SM_FOR_WRITE 2
17659 +#define SYSFS_HIBERNATE 4
17660 +#define SYSFS_RESUME 8
17661 +#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
17662 +#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
17663 +#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
17664 +#define SYSFS_NEEDS_SM_FOR_BOTH \
17665 + (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
17666 +
17667 +int toi_register_sysfs_file(struct kobject *kobj,
17668 +               struct toi_sysfs_data *toi_sysfs_data);
17669 +void toi_unregister_sysfs_file(struct kobject *kobj,
17670 +               struct toi_sysfs_data *toi_sysfs_data);
17671 +
17672 +extern struct kobject *tuxonice_kobj;
17673 +
17674 +struct kobject *make_toi_sysdir(char *name);
17675 +void remove_toi_sysdir(struct kobject *obj);
17676 +extern void toi_cleanup_sysfs(void);
17677 +
17678 +extern int toi_sysfs_init(void);
17679 +extern void toi_sysfs_exit(void);
17680 diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
17681 new file mode 100644
17682 index 0000000..125f21e
17683 --- /dev/null
17684 +++ b/kernel/power/tuxonice_ui.c
17685 @@ -0,0 +1,261 @@
17686 +/*
17687 + * kernel/power/tuxonice_ui.c
17688 + *
17689 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
17690 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
17691 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
17692 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
17693 + *
17694 + * This file is released under the GPLv2.
17695 + *
17696 + * Routines for TuxOnIce's user interface.
17697 + *
17698 + * The user interface code talks to a userspace program via a
17699 + * netlink socket.
17700 + *
17701 + * The kernel side:
17702 + * - starts the userui program;
17703 + * - sends text messages and progress bar status;
17704 + *
17705 + * The user space side:
17706 + * - passes messages regarding user requests (abort, toggle reboot, etc.)
17707 + *
17708 + */
17709 +
17710 +#define __KERNEL_SYSCALLS__
17711 +
17712 +#include <linux/reboot.h>
17713 +
17714 +#include "tuxonice_sysfs.h"
17715 +#include "tuxonice_modules.h"
17716 +#include "tuxonice.h"
17717 +#include "tuxonice_ui.h"
17718 +#include "tuxonice_netlink.h"
17719 +#include "tuxonice_power_off.h"
17720 +#include "tuxonice_builtin.h"
17721 +
17722 +static char local_printf_buf[1024];    /* Same as printk - should be safe */
17723 +struct ui_ops *toi_current_ui;
17724 +
17725 +/**
17726 + * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
17727 + *
17728 + * @timeout: Maximum time to wait.
17729 + *
17730 + * Wait for a keypress, either from userui or /dev/console if userui isn't
17731 + * available. The non-userui path is mainly for boot time, before userui
17732 + * has started, when we have an important warning to give to
17733 + * the user.
17734 + */
17735 +static char toi_wait_for_keypress(int timeout)
17736 +{
17737 +       if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
17738 +               return ' ';
17739 +
17740 +       return toi_wait_for_keypress_dev_console(timeout);
17741 +}
17742 +
17743 +/* toi_early_boot_message()
17744 + * Description:        Handle errors early in the process of booting.
17745 + *             The user may press C to continue booting, perhaps
17746 + *             invalidating the image, or space to reboot.
17747 + *             This works from either the serial console or a normally
17748 + *             attached keyboard.
17749 + *
17750 + *             Note that we come in here from init, while the kernel is
17751 + *             locked. If we want to get events from the serial console,
17752 + *             we need to temporarily unlock the kernel.
17753 + *
17754 + *             toi_early_boot_message may also be called post-boot.
17755 + *             In this case, it simply printks the message and returns.
17756 + *
17757 + * Arguments:  int     message_detail. Which canned warning to print.
17758 + *             int     default_answer. What to do when we time out. This
17759 + *                     will normally be to continue, but the user might
17760 + *                     provide command line options (__setup) to override
17761 + *                     particular cases.
17762 + *             char *  Format string explaining why we're moaning.
17763 + */
17764 +
17765 +#define say(message, a...) printk(KERN_EMERG message, ##a)
17766 +
17767 +void toi_early_boot_message(int message_detail, int default_answer,
17768 +       char *warning_reason, ...)
17769 +{
17770 +       unsigned long orig_state = get_toi_state(), continue_req = 0;
17771 +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
17772 +       unsigned long orig_loglevel = console_loglevel;
17773 +       int can_ask = 1;
17774 +#else
17775 +       int can_ask = 0;
17776 +#endif
17777 +
17778 +       va_list args;
17779 +       int printed_len;
17780 +
17781 +       if (!toi_wait) {
17782 +               set_toi_state(TOI_CONTINUE_REQ);
17783 +               can_ask = 0;
17784 +       }
17785 +
17786 +       if (warning_reason) {
17787 +               va_start(args, warning_reason);
17788 +               printed_len = vsnprintf(local_printf_buf,
17789 +                               sizeof(local_printf_buf),
17790 +                               warning_reason,
17791 +                               args);
17792 +               va_end(args);
17793 +       }
17794 +
17795 +       if (!test_toi_state(TOI_BOOT_TIME)) {
17796 +               printk(KERN_INFO "TuxOnIce: %s\n", local_printf_buf);
17797 +               return;
17798 +       }
17799 +
17800 +       if (!can_ask) {
17801 +               continue_req = !!default_answer;
17802 +               goto post_ask;
17803 +       }
17804 +
17805 +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
17806 +       console_loglevel = 7;
17807 +
17808 +       say("=== TuxOnIce ===\n\n");
17809 +       if (warning_reason) {
17810 +               say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
17811 +               switch (message_detail) {
17812 +               case 0:
17813 +                       say("If you continue booting, note that any image WILL "
17814 +                               "NOT BE REMOVED.\nTuxOnIce is unable to do so "
17815 +                               "because the appropriate modules aren't\n"
17816 +                               "loaded. You should manually remove the image "
17817 +                               "to avoid any\npossibility of corrupting your "
17818 +                               "filesystem(s) later.\n");
17819 +                       break;
17820 +               case 1:
17821 +                       say("If you want to use the current TuxOnIce image, "
17822 +                               "reboot and try\nagain with the same kernel "
17823 +                               "that you hibernated from. If you want\n"
17824 +                               "to forget that image, continue and the image "
17825 +                               "will be erased.\n");
17826 +                       break;
17827 +               }
17828 +               say("Press SPACE to reboot or C to continue booting with "
17829 +                       "this kernel\n\n");
17830 +               if (toi_wait > 0)
17831 +                       say("Default action if you don't select one in %d "
17832 +                               "seconds is: %s.\n",
17833 +                               toi_wait,
17834 +                               default_answer == TOI_CONTINUE_REQ ?
17835 +                               "continue booting" : "reboot");
17836 +       } else {
17837 +               say("BIG FAT WARNING!!\n\n"
17838 +                       "You have tried to resume from this image before.\n"
17839 +                       "If it failed once, it may well fail again.\n"
17840 +                       "Would you like to remove the image and boot "
17841 +                       "normally?\nThis will be equivalent to entering "
17842 +                       "noresume on the\nkernel command line.\n\n"
17843 +                       "Press SPACE to remove the image or C to continue "
17844 +                       "resuming.\n\n");
17845 +               if (toi_wait > 0)
17846 +                       say("Default action if you don't select one in %d "
17847 +                               "seconds is: %s.\n", toi_wait,
17848 +                               !!default_answer ?
17849 +                               "continue resuming" : "remove the image");
17850 +       }
17851 +       console_loglevel = orig_loglevel;
17852 +
17853 +       set_toi_state(TOI_SANITY_CHECK_PROMPT);
17854 +       clear_toi_state(TOI_CONTINUE_REQ);
17855 +
17856 +       if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
17857 +               continue_req = !!default_answer;
17858 +       else
17859 +               continue_req = test_toi_state(TOI_CONTINUE_REQ);
17860 +
17861 +#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
17862 +
17863 +post_ask:
17864 +       if ((warning_reason) && (!continue_req))
17865 +               machine_restart(NULL);
17866 +
17867 +       restore_toi_state(orig_state);
17868 +       if (continue_req)
17869 +               set_toi_state(TOI_CONTINUE_REQ);
17870 +}
17871 +#undef say
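A hypothetical call site (not from the patch), showing how the arguments fit
together; message_detail 1 selects the "reboot with the same kernel, or
continue and erase the image" explanation above:

	toi_early_boot_message(1, TOI_CONTINUE_REQ,
		"Kernel version mismatch: the image was made by %s.",
		"2.6.25-example");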
17872 +
17873 +/*
17874 + * User interface specific /sys/power/tuxonice entries.
17875 + */
17876 +
17877 +static struct toi_sysfs_data sysfs_params[] = {
17878 +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
17879 +       { TOI_ATTR("default_console_level", SYSFS_RW),
17880 +         SYSFS_INT(&toi_bkd.toi_default_console_level, 0, 7, 0)
17881 +       },
17882 +
17883 +       { TOI_ATTR("debug_sections", SYSFS_RW),
17884 +         SYSFS_UL(&toi_bkd.toi_debug_state, 0, 1 << 30, 0)
17885 +       },
17886 +
17887 +       { TOI_ATTR("log_everything", SYSFS_RW),
17888 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_LOGALL, 0)
17889 +       },
17890 +#endif
17891 +       { TOI_ATTR("pm_prepare_console", SYSFS_RW),
17892 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_PM_PREPARE_CONSOLE, 0)
17893 +       }
17894 +};
17895 +
17896 +static struct toi_module_ops userui_ops = {
17897 +       .type                           = MISC_HIDDEN_MODULE,
17898 +       .name                           = "printk ui",
17899 +       .directory                      = "user_interface",
17900 +       .module                         = THIS_MODULE,
17901 +       .sysfs_data                     = sysfs_params,
17902 +       .num_sysfs_entries              = sizeof(sysfs_params) /
17903 +               sizeof(struct toi_sysfs_data),
17904 +};
17905 +
17906 +int toi_register_ui_ops(struct ui_ops *this_ui)
17907 +{
17908 +       if (toi_current_ui) {
17909 +               printk(KERN_INFO "Only one TuxOnIce user interface module can "
17910 +                               "be loaded at a time.\n");
17911 +               return -EBUSY;
17912 +       }
17913 +
17914 +       toi_current_ui = this_ui;
17915 +
17916 +       return 0;
17917 +}
17918 +
17919 +void toi_remove_ui_ops(struct ui_ops *this_ui)
17920 +{
17921 +       if (toi_current_ui != this_ui)
17922 +               return;
17923 +
17924 +       toi_current_ui = NULL;
17925 +}
17926 +
17927 +/* toi_ui_init
17928 + * Description: Boot time initialisation for the user interface.
17929 + */
17930 +
17931 +int toi_ui_init(void)
17932 +{
17933 +       return toi_register_module(&userui_ops);
17934 +}
17935 +
17936 +void toi_ui_exit(void)
17937 +{
17938 +       toi_unregister_module(&userui_ops);
17939 +}
17940 +
17941 +#ifdef CONFIG_TOI_EXPORTS
17942 +EXPORT_SYMBOL_GPL(toi_current_ui);
17943 +EXPORT_SYMBOL_GPL(toi_early_boot_message);
17944 +EXPORT_SYMBOL_GPL(toi_register_ui_ops);
17945 +EXPORT_SYMBOL_GPL(toi_remove_ui_ops);
17946 +#endif
17947 diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
17948 new file mode 100644
17949 index 0000000..7825ddf
17950 --- /dev/null
17951 +++ b/kernel/power/tuxonice_ui.h
17952 @@ -0,0 +1,104 @@
17953 +/*
17954 + * kernel/power/tuxonice_ui.h
17955 + *
17956 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
17957 + */
17958 +
17959 +enum {
17960 +       DONT_CLEAR_BAR,
17961 +       CLEAR_BAR
17962 +};
17963 +
17964 +enum {
17965 +       /* Userspace -> Kernel */
17966 +       USERUI_MSG_ABORT = 0x11,
17967 +       USERUI_MSG_SET_STATE = 0x12,
17968 +       USERUI_MSG_GET_STATE = 0x13,
17969 +       USERUI_MSG_GET_DEBUG_STATE = 0x14,
17970 +       USERUI_MSG_SET_DEBUG_STATE = 0x15,
17971 +       USERUI_MSG_SPACE = 0x18,
17972 +       USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
17973 +       USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
17974 +       USERUI_MSG_GET_LOGLEVEL = 0x1C,
17975 +       USERUI_MSG_SET_LOGLEVEL = 0x1D,
17976 +       USERUI_MSG_PRINTK = 0x1E,
17977 +
17978 +       /* Kernel -> Userspace */
17979 +       USERUI_MSG_MESSAGE = 0x21,
17980 +       USERUI_MSG_PROGRESS = 0x22,
17981 +       USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
17982 +
17983 +       USERUI_MSG_MAX,
17984 +};
17985 +
17986 +struct userui_msg_params {
17987 +       unsigned long a, b, c, d;
17988 +       char text[255];
17989 +};
17990 +
17991 +struct ui_ops {
17992 +       char (*wait_for_key) (int timeout);
17993 +       unsigned long (*update_status) (unsigned long value,
17994 +               unsigned long maximum, const char *fmt, ...);
17995 +       void (*prepare_status) (int clearbar, const char *fmt, ...);
17996 +       void (*cond_pause) (int pause, char *message);
17997 +       void (*abort)(int result_code, const char *fmt, ...);
17998 +       void (*prepare)(void);
17999 +       void (*cleanup)(void);
18000 +       void (*post_atomic_restore)(void);
18001 +       void (*message)(unsigned long section, unsigned long level,
18002 +               int normally_logged, const char *fmt, ...);
18003 +};
18004 +
18005 +extern struct ui_ops *toi_current_ui;
18006 +
18007 +#define toi_update_status(val, max, fmt, args...) \
18008 + (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
18009 +       max)
18010 +
18011 +#define toi_ui_post_atomic_restore(void) \
18012 +       do { if (toi_current_ui) \
18013 +               (toi_current_ui->post_atomic_restore)(); \
18014 +       } while (0)
18015 +
18016 +#define toi_prepare_console(void) \
18017 +       do { if (toi_current_ui) \
18018 +               (toi_current_ui->prepare)(); \
18019 +       } while (0)
18020 +
18021 +#define toi_cleanup_console(void) \
18022 +       do { if (toi_current_ui) \
18023 +               (toi_current_ui->cleanup)(); \
18024 +       } while (0)
18025 +
18026 +#define abort_hibernate(result, fmt, args...) \
18027 +       do { if (toi_current_ui) \
18028 +               (toi_current_ui->abort)(result, fmt, ##args); \
18029 +            else { \
18030 +               set_abort_result(result); \
18031 +            } \
18032 +       } while (0)
18033 +
18034 +#define toi_cond_pause(pause, message) \
18035 +       do { if (toi_current_ui) \
18036 +               (toi_current_ui->cond_pause)(pause, message); \
18037 +       } while (0)
18038 +
18039 +#define toi_prepare_status(clear, fmt, args...) \
18040 +       do { if (toi_current_ui) \
18041 +               (toi_current_ui->prepare_status)(clear, fmt, ##args); \
18042 +            else \
18043 +               printk(KERN_ERR fmt "%s", ##args, "\n"); \
18044 +       } while (0)
18045 +
18046 +#define toi_message(sn, lev, log, fmt, a...) \
18047 +do { \
18048 +       if (toi_current_ui && (!sn || test_debug_state(sn))) \
18049 +               toi_current_ui->message(sn, lev, log, fmt, ##a); \
18050 +} while (0)
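A minimal sketch (hypothetical function, not from the patch) of how core code
drives these wrappers; toi_update_status returns the next value at which the
bar actually needs refreshing, so the caller can skip the calls in between:

	static void example_write_pages(unsigned long total)
	{
		unsigned long i, next_update = 0;

		toi_prepare_status(CLEAR_BAR, "Writing pages...");

		for (i = 0; i < total; i++) {
			/* ... write page i ... */
			if (i >= next_update)
				next_update = toi_update_status(i, total, NULL);
		}
	}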
18051 +
18052 +__exit void toi_ui_cleanup(void);
18053 +extern int toi_ui_init(void);
18054 +extern void toi_ui_exit(void);
18055 +extern int toi_register_ui_ops(struct ui_ops *this_ui);
18056 +extern void toi_remove_ui_ops(struct ui_ops *this_ui);
18057 diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
18058 new file mode 100644
18059 index 0000000..634a801
18060 --- /dev/null
18061 +++ b/kernel/power/tuxonice_userui.c
18062 @@ -0,0 +1,674 @@
18063 +/*
18064 + * kernel/power/tuxonice_userui.c
18065 + *
18066 + * Copyright (C) 2005-2007 Bernard Blackham
18067 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
18068 + *
18069 + * This file is released under the GPLv2.
18070 + *
18071 + * Routines for TuxOnIce's user interface.
18072 + *
18073 + * The user interface code talks to a userspace program via a
18074 + * netlink socket.
18075 + *
18076 + * The kernel side:
18077 + * - starts the userui program;
18078 + * - sends text messages and progress bar status;
18079 + *
18080 + * The user space side:
18081 + * - passes messages regarding user requests (abort, toggle reboot, etc.)
18082 + *
18083 + */
18084 +
18085 +#define __KERNEL_SYSCALLS__
18086 +
18087 +#include <linux/suspend.h>
18088 +#include <linux/freezer.h>
18089 +#include <linux/console.h>
18090 +#include <linux/ctype.h>
18091 +#include <linux/tty.h>
18092 +#include <linux/vt_kern.h>
18093 +#include <linux/module.h>
18094 +#include <linux/reboot.h>
18095 +#include <linux/kmod.h>
18096 +#include <linux/security.h>
18097 +#include <linux/syscalls.h>
18098 +
18099 +#include "tuxonice_sysfs.h"
18100 +#include "tuxonice_modules.h"
18101 +#include "tuxonice.h"
18102 +#include "tuxonice_ui.h"
18103 +#include "tuxonice_netlink.h"
18104 +#include "tuxonice_power_off.h"
18105 +
18106 +static char local_printf_buf[1024];    /* Same as printk - should be safe */
18107 +
18108 +static struct user_helper_data ui_helper_data;
18109 +static struct toi_module_ops userui_ops;
18110 +static int orig_kmsg;
18111 +
18112 +static char lastheader[512];
18113 +static int lastheader_message_len;
18114 +static int ui_helper_changed; /* Used at resume time so we don't overwrite
18115 +                               a value set from an initrd/ramfs. */
18116 +
18117 +/* Number of distinct progress amounts that userspace can display */
18118 +static int progress_granularity = 30;
18119 +
18120 +static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
18121 +
18122 +/**
18123 + * ui_nl_set_state - Update toi_action based on a message from userui.
18124 + *
18125 + * @n: The bit (1 << bit) to set.
18126 + */
18127 +static void ui_nl_set_state(int n)
18128 +{
18129 +       /* Only let them change certain settings */
18130 +       static const int toi_action_mask =
18131 +               (1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
18132 +               (1 << TOI_LOGALL) |
18133 +               (1 << TOI_SINGLESTEP) |
18134 +               (1 << TOI_PAUSE_NEAR_PAGESET_END);
18135 +
18136 +       toi_bkd.toi_action = (toi_bkd.toi_action & (~toi_action_mask)) |
18137 +               (n & toi_action_mask);
18138 +
18139 +       if (!test_action_state(TOI_PAUSE) &&
18140 +                       !test_action_state(TOI_SINGLESTEP))
18141 +               wake_up_interruptible(&userui_wait_for_key);
18142 +}
18143 +
18144 +/**
18145 + * userui_post_atomic_restore - Tell userui that atomic restore just happened.
18146 + *
18147 + * Tell userui that atomic restore just occurred, so that it can do things like
18148 + * redrawing the screen, re-getting settings and so on.
18149 + */
18150 +static void userui_post_atomic_restore(void)
18151 +{
18152 +       toi_send_netlink_message(&ui_helper_data,
18153 +                       USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
18154 +}
18155 +
18156 +/**
18157 + * userui_storage_needed - Report how much memory in image header is needed.
18158 + */
18159 +static int userui_storage_needed(void)
18160 +{
18161 +       return sizeof(ui_helper_data.program) + 1 + sizeof(int);
18162 +}
18163 +
18164 +/**
18165 + * userui_save_config_info - Fill buffer with config info for image header.
18166 + *
18167 + * @buf: Buffer into which to put the config info we want to save.
18168 + */
18169 +static int userui_save_config_info(char *buf)
18170 +{
18171 +       *((int *) buf) = progress_granularity;
18172 +       memcpy(buf + sizeof(int), ui_helper_data.program,
18173 +                       sizeof(ui_helper_data.program));
18174 +       return sizeof(ui_helper_data.program) + sizeof(int) + 1;
18175 +}
18176 +
18177 +/**
18178 + * userui_load_config_info - Restore config info from buffer.
18179 + *
18180 + * @buf: Buffer containing header info loaded.
18181 + * @size: Size of data loaded for this module.
18182 + */
18183 +static void userui_load_config_info(char *buf, int size)
18184 +{
18185 +       progress_granularity = *((int *) buf);
18186 +       size -= sizeof(int);
18187 +
18188 +       /* Don't load the saved path if one has already been set */
18189 +       if (ui_helper_changed)
18190 +               return;
18191 +
18192 +       if (size > sizeof(ui_helper_data.program))
18193 +               size = sizeof(ui_helper_data.program);
18194 +
18195 +       memcpy(ui_helper_data.program, buf + sizeof(int), size);
18196 +       ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
18197 +}
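/*
 * For reference, the image-header blob the two routines above agree on
 * (layout inferred from the code):
 *
 *   offset 0           : int  progress_granularity
 *   offset sizeof(int) : char program[sizeof(ui_helper_data.program)]
 *                        (NUL-terminated userui helper path)
 */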
18198 +
18199 +/**
18200 + * set_ui_program_set: Record that userui program was changed.
18201 + *
18202 + * Side effect routine for when the userui program is set. In an initrd or
18203 + * ramfs, the user may set a location for the userui program. If this happens,
18204 + * we don't want to reload the value that was saved in the image header. This
18205 + * routine allows us to flag that we shouldn't restore the program name from
18206 + * the image header.
18207 + */
18208 +static void set_ui_program_set(void)
18209 +{
18210 +       ui_helper_changed = 1;
18211 +}
18212 +
18213 +/**
18214 + * userui_memory_needed - Tell core how much memory to reserve for us.
18215 + */
18216 +static int userui_memory_needed(void)
18217 +{
18218 +       /* ballpark figure of 128 pages */
18219 +       return 128 * PAGE_SIZE;
18220 +}
18221 +
18222 +/**
18223 + * userui_update_status - Update the progress bar and (if on) in-bar message.
18224 + *
18225 + * @value: Current progress percentage numerator.
18226 + * @maximum: Current progress percentage denominator.
18227 + * @fmt: Message to be displayed in the middle of the progress bar.
18228 + *
18229 + * Note that a NULL message does not mean that any previous message is erased!
18230 + * For that, you need toi_prepare_status with clearbar on.
18231 + *
18232 + * Returns an unsigned long, being the next numerator (as determined by the
18233 + * maximum and progress granularity) where status needs to be updated.
18234 + * This is to reduce unnecessary calls to update_status.
18235 + */
18236 +static unsigned long userui_update_status(unsigned long value,
18237 +               unsigned long maximum, const char *fmt, ...)
18238 +{
18239 +       static int last_step = -1;
18240 +       struct userui_msg_params msg;
18241 +       int bitshift;
18242 +       int this_step;
18243 +       unsigned long next_update;
18244 +
18245 +       if (ui_helper_data.pid == -1)
18246 +               return 0;
18247 +
18248 +       if ((!maximum) || (!progress_granularity))
18249 +               return maximum;
18250 +
18254 +       if (value > maximum)
18255 +               value = maximum;
18256 +
18257 +       /* Try to avoid math problems - we can't do 64 bit math here
18258 +        * (and shouldn't need it - anyone got screen resolution
18259 +        * of 65536 pixels or more?) */
18260 +       bitshift = fls(maximum) - 16;
18261 +       if (bitshift > 0) {
18262 +               unsigned long temp_maximum = maximum >> bitshift;
18263 +               unsigned long temp_value = value >> bitshift;
18264 +               this_step = (int)
18265 +                       (temp_value * progress_granularity / temp_maximum);
18266 +               next_update = (((this_step + 1) * temp_maximum /
18267 +                                       progress_granularity) + 1) << bitshift;
18268 +       } else {
18269 +               this_step = (int) (value * progress_granularity / maximum);
18270 +               next_update = ((this_step + 1) * maximum /
18271 +                               progress_granularity) + 1;
18272 +       }
18273 +
18274 +       if (this_step == last_step)
18275 +               return next_update;
18276 +
18277 +       memset(&msg, 0, sizeof(msg));
18278 +
18279 +       msg.a = this_step;
18280 +       msg.b = progress_granularity;
18281 +
18282 +       if (fmt) {
18283 +               va_list args;
18284 +               va_start(args, fmt);
18285 +               vsnprintf(msg.text, sizeof(msg.text), fmt, args);
18286 +               va_end(args);
18287 +               msg.text[sizeof(msg.text)-1] = '\0';
18288 +       }
18289 +
18290 +       toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
18291 +                       &msg, sizeof(msg));
18292 +       last_step = this_step;
18293 +
18294 +       return next_update;
18295 +}
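/*
 * Worked example of the scaling above (illustrative values only):
 * maximum = 200000, progress_granularity = 30, value = 100000.
 * fls(200000) = 18, so bitshift = 2, temp_maximum = 50000 and
 * temp_value = 25000. Then this_step = 25000 * 30 / 50000 = 15 (the
 * bar sits at step 15 of 30) and next_update = ((16 * 50000 / 30) + 1)
 * << 2 = 106668, so no further netlink traffic is needed until value
 * reaches roughly 53% of maximum.
 */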
18296 +
18297 +/**
18298 + * userui_message - Display a message without necessarily logging it.
18299 + *
18300 + * @section: Type of message. Messages can be filtered by type.
18301 + * @level: Degree of importance of the message. Lower values = higher priority.
18302 + * @normally_logged: Whether logged even if log_everything is off.
18303 + * @fmt: Message (and parameters).
18304 + *
18305 + * This function is intended to do the same job as printk, but without normally
18306 + * logging what is printed. The point is to be able to get debugging info on
18307 + * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
18308 + *
18309 + * It may be called from an interrupt context - can't sleep!
18310 + */
18311 +static void userui_message(unsigned long section, unsigned long level,
18312 +               int normally_logged, const char *fmt, ...)
18313 +{
18314 +       struct userui_msg_params msg;
18315 +
18316 +       if ((level) && (level > console_loglevel))
18317 +               return;
18318 +
18319 +       memset(&msg, 0, sizeof(msg));
18320 +
18321 +       msg.a = section;
18322 +       msg.b = level;
18323 +       msg.c = normally_logged;
18324 +
18325 +       if (fmt) {
18326 +               va_list args;
18327 +               va_start(args, fmt);
18328 +               vsnprintf(msg.text, sizeof(msg.text), fmt, args);
18329 +               va_end(args);
18330 +               msg.text[sizeof(msg.text)-1] = '\0';
18331 +       }
18332 +
18333 +       if (test_action_state(TOI_LOGALL))
18334 +               printk(KERN_INFO "%s\n", msg.text);
18335 +
18336 +       toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
18337 +                       &msg, sizeof(msg));
18338 +}
18339 +
18340 +/**
18341 + * wait_for_key_via_userui - Wait for userui to receive a keypress.
18342 + */
18343 +static void wait_for_key_via_userui(void)
18344 +{
18345 +       DECLARE_WAITQUEUE(wait, current);
18346 +
18347 +       add_wait_queue(&userui_wait_for_key, &wait);
18348 +       set_current_state(TASK_INTERRUPTIBLE);
18349 +
18350 +       schedule(); /* already queued in TASK_INTERRUPTIBLE above */
18351 +
18352 +       set_current_state(TASK_RUNNING);
18353 +       remove_wait_queue(&userui_wait_for_key, &wait);
18354 +}
18355 +
18356 +/**
18357 + * userui_prepare_status - Display high level messages.
18358 + *
18359 + * @clearbar: Whether to clear the progress bar.
18360 + * @fmt...: New message for the title.
18361 + *
18362 + * Prepare the 'nice display', drawing the header and version, along with the
18363 + * current action and perhaps also resetting the progress bar.
18364 + */
18365 +static void userui_prepare_status(int clearbar, const char *fmt, ...)
18366 +{
18367 +       va_list args;
18368 +
18369 +       if (fmt) {
18370 +               va_start(args, fmt);
18371 +               lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
18372 +               va_end(args);
18373 +       }
18374 +
18375 +       if (clearbar)
18376 +               toi_update_status(0, 1, NULL);
18377 +
18378 +       if (ui_helper_data.pid == -1)
18379 +               printk(KERN_EMERG "%s\n", lastheader);
18380 +       else
18381 +               toi_message(0, TOI_STATUS, 1, lastheader, NULL);
18382 +}
18383 +
18384 +/**
18385 + * toi_wait_for_keypress - Wait for keypress via userui.
18386 + *
18387 + * @timeout: Maximum time to wait.
18388 + *
18389 + * Wait for a keypress from userui.
18390 + *
18391 + * FIXME: Implement timeout?
18392 + */
18393 +static char userui_wait_for_keypress(int timeout)
18394 +{
18395 +       char key = '\0';
18396 +
18397 +       if (ui_helper_data.pid != -1) {
18398 +               wait_for_key_via_userui();
18399 +               key = ' ';
18400 +       }
18401 +
18402 +       return key;
18403 +}
18404 +
18405 +/**
18406 + * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
18407 + *
18408 + * @result_code: Reason why we're aborting (1 << bit).
18409 + * @fmt: Message to display if telling the user what's going on.
18410 + *
18411 + * Abort a cycle. If this wasn't at the user's request (and we're displaying
18412 + * output), tell the user why and wait for them to acknowledge the message.
18413 + */
18414 +static void userui_abort_hibernate(int result_code, const char *fmt, ...)
18415 +{
18416 +       va_list args;
18417 +       int printed_len = 0;
18418 +
18419 +       set_result_state(result_code);
18420 +
18421 +       if (test_result_state(TOI_ABORTED))
18422 +               return;
18423 +
18424 +       set_result_state(TOI_ABORTED);
18425 +
18426 +       if (test_result_state(TOI_ABORT_REQUESTED))
18427 +               return;
18428 +
18429 +       va_start(args, fmt);
18430 +       printed_len = vscnprintf(local_printf_buf, sizeof(local_printf_buf),
18431 +                       fmt, args);
18432 +       va_end(args);
18433 +       if (ui_helper_data.pid != -1)
18434 +               printed_len = snprintf(local_printf_buf + printed_len,
18435 +                               sizeof(local_printf_buf) - printed_len,
18436 +                               " (Press SPACE to continue)");
18436 +
18437 +       toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
18438 +
18439 +       if (ui_helper_data.pid != -1)
18440 +               userui_wait_for_keypress(0);
18441 +}
18442 +
18443 +/**
18444 + * request_abort_hibernate - Abort hibernating or resuming at user request.
18445 + *
18446 + * Handle the user requesting the cancellation of a hibernation or resume by
18447 + * pressing escape.
18448 + */
18449 +static void request_abort_hibernate(void)
18450 +{
18451 +       if (test_result_state(TOI_ABORT_REQUESTED))
18452 +               return;
18453 +
18454 +       if (test_toi_state(TOI_NOW_RESUMING)) {
18455 +               toi_prepare_status(CLEAR_BAR, "Escape pressed. "
18456 +                                       "Powering down again.");
18457 +               set_toi_state(TOI_STOP_RESUME);
18458 +               while (!test_toi_state(TOI_IO_STOPPED))
18459 +                       schedule();
18460 +               if (toiActiveAllocator->mark_resume_attempted)
18461 +                       toiActiveAllocator->mark_resume_attempted(0);
18462 +               toi_power_down();
18463 +       }
18464 +
18465 +       toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
18466 +                                       " ABORTING HIBERNATION ---");
18467 +       set_abort_result(TOI_ABORT_REQUESTED);
18468 +       wake_up_interruptible(&userui_wait_for_key);
18469 +}
18470 +
18471 +/**
18472 + * userui_user_rcv_msg - Receive a netlink message from userui.
18473 + *
18474 + * @skb: skb received.
18475 + * @nlh: Netlink header received.
18476 + */
18477 +static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
18478 +{
18479 +       int type;
18480 +       int *data;
18481 +
18482 +       type = nlh->nlmsg_type;
18483 +
18484 +       /* A control message: ignore them */
18485 +       if (type < NETLINK_MSG_BASE)
18486 +               return 0;
18487 +
18488 +       /* Unknown message: reply with EINVAL */
18489 +       if (type >= USERUI_MSG_MAX)
18490 +               return -EINVAL;
18491 +
18492 +       /* All operations require privileges, even GET */
18493 +       if (security_netlink_recv(skb, CAP_NET_ADMIN))
18494 +               return -EPERM;
18495 +
18496 +       /* Only allow one task to receive NOFREEZE privileges */
18497 +       if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
18498 +               printk(KERN_INFO "Got NOFREEZE_ME request when "
18499 +                       "ui_helper_data.pid is %d.\n", ui_helper_data.pid);
18500 +               return -EBUSY;
18501 +       }
18502 +
18503 +       data = (int *) NLMSG_DATA(nlh);
18504 +
18505 +       switch (type) {
18506 +       case USERUI_MSG_ABORT:
18507 +               request_abort_hibernate();
18508 +               return 0;
18509 +       case USERUI_MSG_GET_STATE:
18510 +               toi_send_netlink_message(&ui_helper_data,
18511 +                               USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
18512 +                               sizeof(toi_bkd.toi_action));
18513 +               return 0;
18514 +       case USERUI_MSG_GET_DEBUG_STATE:
18515 +               toi_send_netlink_message(&ui_helper_data,
18516 +                               USERUI_MSG_GET_DEBUG_STATE,
18517 +                               &toi_bkd.toi_debug_state,
18518 +                               sizeof(toi_bkd.toi_debug_state));
18519 +               return 0;
18520 +       case USERUI_MSG_SET_STATE:
18521 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18522 +                       return -EINVAL;
18523 +               ui_nl_set_state(*data);
18524 +               return 0;
18525 +       case USERUI_MSG_SET_DEBUG_STATE:
18526 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18527 +                       return -EINVAL;
18528 +               toi_bkd.toi_debug_state = (*data);
18529 +               return 0;
18530 +       case USERUI_MSG_SPACE:
18531 +               wake_up_interruptible(&userui_wait_for_key);
18532 +               return 0;
18533 +       case USERUI_MSG_GET_POWERDOWN_METHOD:
18534 +               toi_send_netlink_message(&ui_helper_data,
18535 +                               USERUI_MSG_GET_POWERDOWN_METHOD,
18536 +                               &toi_poweroff_method,
18537 +                               sizeof(toi_poweroff_method));
18538 +               return 0;
18539 +       case USERUI_MSG_SET_POWERDOWN_METHOD:
18540 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18541 +                       return -EINVAL;
18542 +               toi_poweroff_method = (*data);
18543 +               return 0;
18544 +       case USERUI_MSG_GET_LOGLEVEL:
18545 +               toi_send_netlink_message(&ui_helper_data,
18546 +                               USERUI_MSG_GET_LOGLEVEL,
18547 +                               &toi_bkd.toi_default_console_level,
18548 +                               sizeof(toi_bkd.toi_default_console_level));
18549 +               return 0;
18550 +       case USERUI_MSG_SET_LOGLEVEL:
18551 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18552 +                       return -EINVAL;
18553 +               toi_bkd.toi_default_console_level = (*data);
18554 +               return 0;
18555 +       case USERUI_MSG_PRINTK:
18556 +               printk("%s", (char *) data);
18557 +               return 0;
18558 +       }
18559 +
18560 +       /* Unhandled here */
18561 +       return 1;
18562 +}
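For context, a hedged userspace sketch of the other end of this protocol (not
part of the patch; it assumes NETLINK_TOI_USERUI and the message values from
tuxonice_ui.h are visible to userspace, and CAP_NET_ADMIN per the check above):

	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <string.h>
	#include <unistd.h>

	static void send_space(void)
	{
		struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
		struct nlmsghdr nlh;
		int sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_TOI_USERUI);

		if (sk < 0)
			return;

		memset(&nlh, 0, sizeof(nlh));
		nlh.nlmsg_len = NLMSG_LENGTH(0);   /* header only, no payload */
		nlh.nlmsg_type = USERUI_MSG_SPACE;

		/* Lands in userui_user_rcv_msg() above. */
		sendto(sk, &nlh, nlh.nlmsg_len, 0,
		       (struct sockaddr *) &addr, sizeof(addr));
		close(sk);
	}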
18563 +
18564 +/**
18565 + * userui_cond_pause - Possibly pause at user request.
18566 + *
18567 + * @pause: Whether to pause or just display the message.
18568 + * @message: Message to display at the start of pausing.
18569 + *
18570 + * Potentially pause and wait for the user to tell us to continue. We normally
18571 + * only pause when @pause is set. While paused, the user can do things like
18572 + * changing the loglevel, toggling the display of debugging sections and such
18573 + * like.
18574 + */
18575 +static void userui_cond_pause(int pause, char *message)
18576 +{
18577 +       int displayed_message = 0, last_key = 0;
18578 +
18579 +       while (last_key != 32 &&
18580 +               ui_helper_data.pid != -1 &&
18581 +               ((test_action_state(TOI_PAUSE) && pause) ||
18582 +                (test_action_state(TOI_SINGLESTEP)))) {
18583 +               if (!displayed_message) {
18584 +                       toi_prepare_status(DONT_CLEAR_BAR,
18585 +                          "%s Press SPACE to continue.%s",
18586 +                          message ? message : "",
18587 +                          (test_action_state(TOI_SINGLESTEP)) ?
18588 +                          " Single step on." : "");
18589 +                       displayed_message = 1;
18590 +               }
18591 +               last_key = userui_wait_for_keypress(0);
18592 +       }
18593 +       schedule();
18594 +}
18595 +
18596 +/**
18597 + * userui_prepare_console - Prepare the console for use.
18598 + *
18599 + * Prepare a console for use, saving current kmsg settings and attempting to
18600 + * start userui. Console loglevel changes are handled by userui.
18601 + */
18602 +static void userui_prepare_console(void)
18603 +{
18604 +       orig_kmsg = kmsg_redirect;
18605 +       kmsg_redirect = fg_console + 1;
18606 +
18607 +       ui_helper_data.pid = -1;
18608 +
18609 +       if (!userui_ops.enabled) {
18610 +               printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
18611 +               return;
18612 +       }
18613 +
18614 +       if (*ui_helper_data.program)
18615 +               toi_netlink_setup(&ui_helper_data);
18616 +       else
18617 +               printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
18618 +}
18619 +
18620 +/**
18621 + * userui_cleanup_console - Cleanup after a cycle.
18622 + *
18623 + * Tell userui to cleanup, and restore kmsg_redirect to its original value.
18624 + */
18625 +
18626 +static void userui_cleanup_console(void)
18627 +{
18628 +       if (ui_helper_data.pid > -1)
18629 +               toi_netlink_close(&ui_helper_data);
18630 +
18631 +       kmsg_redirect = orig_kmsg;
18632 +}
18633 +
18634 +/*
18635 + * User interface specific /sys/power/tuxonice entries.
18636 + */
18637 +
18638 +static struct toi_sysfs_data sysfs_params[] = {
18639 +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
18640 +       { TOI_ATTR("enable_escape", SYSFS_RW),
18641 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_CAN_CANCEL, 0)
18642 +       },
18643 +
18644 +       { TOI_ATTR("pause_between_steps", SYSFS_RW),
18645 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_PAUSE, 0)
18646 +       },
18647 +
18648 +       { TOI_ATTR("enabled", SYSFS_RW),
18649 +         SYSFS_INT(&userui_ops.enabled, 0, 1, 0)
18650 +       },
18651 +
18652 +       { TOI_ATTR("progress_granularity", SYSFS_RW),
18653 +         SYSFS_INT(&progress_granularity, 1, 2048, 0)
18654 +       },
18655 +
18656 +       { TOI_ATTR("program", SYSFS_RW),
18657 +         SYSFS_STRING(ui_helper_data.program, 255, 0),
18658 +         .write_side_effect = set_ui_program_set,
18659 +       },
18660 +#endif
18661 +};
18662 +
18663 +static struct toi_module_ops userui_ops = {
18664 +       .type                           = MISC_MODULE,
18665 +       .name                           = "userui",
18666 +       .shared_directory               = "user_interface",
18667 +       .module                         = THIS_MODULE,
18668 +       .storage_needed                 = userui_storage_needed,
18669 +       .save_config_info               = userui_save_config_info,
18670 +       .load_config_info               = userui_load_config_info,
18671 +       .memory_needed                  = userui_memory_needed,
18672 +       .sysfs_data                     = sysfs_params,
18673 +       .num_sysfs_entries              = sizeof(sysfs_params) /
18674 +               sizeof(struct toi_sysfs_data),
18675 +};
18676 +
18677 +static struct ui_ops my_ui_ops = {
18678 +       .post_atomic_restore            = userui_post_atomic_restore,
18679 +       .update_status                  = userui_update_status,
18680 +       .message                        = userui_message,
18681 +       .prepare_status                 = userui_prepare_status,
18682 +       .abort                          = userui_abort_hibernate,
18683 +       .cond_pause                     = userui_cond_pause,
18684 +       .prepare                        = userui_prepare_console,
18685 +       .cleanup                        = userui_cleanup_console,
18686 +       .wait_for_key                   = userui_wait_for_keypress,
18687 +};
18688 +
18689 +/**
18690 + * toi_user_ui_init - Boot time initialisation for user interface.
18691 + *
18692 + * Invoked from the core init routine.
18693 + */
18694 +static __init int toi_user_ui_init(void)
18695 +{
18696 +       int result;
18697 +
18698 +       ui_helper_data.nl = NULL;
18699 +       strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
18700 +       ui_helper_data.pid = -1;
18701 +       ui_helper_data.skb_size = sizeof(struct userui_msg_params);
18702 +       ui_helper_data.pool_limit = 6;
18703 +       ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
18704 +       ui_helper_data.name = "userspace ui";
18705 +       ui_helper_data.rcv_msg = userui_user_rcv_msg;
18706 +       ui_helper_data.interface_version = 7;
18707 +       ui_helper_data.must_init = 0;
18708 +       ui_helper_data.not_ready = userui_cleanup_console;
18709 +       init_completion(&ui_helper_data.wait_for_process);
18710 +       result = toi_register_module(&userui_ops);
18711 +       if (!result)
18712 +               result = toi_register_ui_ops(&my_ui_ops);
18713 +       if (result)
18714 +               toi_unregister_module(&userui_ops);
18715 +
18716 +       return result;
18717 +}
18718 +
18719 +#ifdef MODULE
18720 +/**
18721 + * toi_user_ui_exit - Cleanup code for when the module is unloaded.
18722 + */
18723 +static __exit void toi_user_ui_exit(void)
18724 +{
18725 +       toi_remove_ui_ops(&my_ui_ops);
18726 +       toi_unregister_module(&userui_ops);
18727 +}
18728 +
18729 +module_init(toi_user_ui_init);
18730 +module_exit(toi_user_ui_exit);
18731 +MODULE_AUTHOR("Nigel Cunningham");
18732 +MODULE_DESCRIPTION("TuxOnIce Userui Support");
18733 +MODULE_LICENSE("GPL");
18734 +#else
18735 +late_initcall(toi_user_ui_init);
18736 +#endif
18737 diff --git a/kernel/printk.c b/kernel/printk.c
18738 index bdd4ea8..cb95b05 100644
18739 --- a/kernel/printk.c
18740 +++ b/kernel/printk.c
18741 @@ -32,6 +32,7 @@
18742  #include <linux/security.h>
18743  #include <linux/bootmem.h>
18744  #include <linux/syscalls.h>
18745 +#include <linux/suspend.h>
18746  
18747  #include <asm/uaccess.h>
18748  
18749 @@ -99,9 +101,12 @@ static DEFINE_SPINLOCK(logbuf_lock);
18750   * The indices into log_buf are not constrained to log_buf_len - they
18751   * must be masked before subscripting
18752   */
18753 -static unsigned log_start;     /* Index into log_buf: next char to be read by syslog() */
18754 -static unsigned con_start;     /* Index into log_buf: next char to be sent to consoles */
18755 -static unsigned log_end;       /* Index into log_buf: most-recently-written-char + 1 */
18756 +/* Index into log_buf: next char to be read by syslog() */
18757 +static unsigned POSS_NOSAVE log_start;
18758 +/* Index into log_buf: next char to be sent to consoles */
18759 +static unsigned POSS_NOSAVE con_start;
18760 +/* Index into log_buf: most-recently-written-char + 1 */
18761 +static unsigned POSS_NOSAVE log_end;
18762  
18763  /*
18764   *     Array of consoles built from command line options (console=)
18765 @@ -124,10 +129,11 @@ static int console_may_schedule;
18766  
18767  #ifdef CONFIG_PRINTK
18768  
18769 -static char __log_buf[__LOG_BUF_LEN];
18770 -static char *log_buf = __log_buf;
18771 -static int log_buf_len = __LOG_BUF_LEN;
18772 -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
18773 +static POSS_NOSAVE char __log_buf[__LOG_BUF_LEN];
18774 +static POSS_NOSAVE char *log_buf = __log_buf;
18775 +static POSS_NOSAVE int log_buf_len = __LOG_BUF_LEN;
18776 +/* Number of chars produced since last read+clear operation */
18777 +static POSS_NOSAVE unsigned logged_chars;
18778  
18779  static int __init log_buf_len_setup(char *str)
18780  {
18781 @@ -927,6 +933,7 @@ void suspend_console(void)
18782         acquire_console_sem();
18783         console_suspended = 1;
18784  }
18785 +EXPORT_SYMBOL(suspend_console);
18786  
18787  void resume_console(void)
18788  {
18789 @@ -935,6 +942,7 @@ void resume_console(void)
18790         console_suspended = 0;
18791         release_console_sem();
18792  }
18793 +EXPORT_SYMBOL(resume_console);
18794  
18795  /**
18796   * acquire_console_sem - lock the console system for exclusive use.
18797 diff --git a/kernel/timer.c b/kernel/timer.c
18798 index b024106..0ff93c9 100644
18799 --- a/kernel/timer.c
18800 +++ b/kernel/timer.c
18801 @@ -37,6 +37,8 @@
18802  #include <linux/delay.h>
18803  #include <linux/tick.h>
18804  #include <linux/kallsyms.h>
18805 +#include <linux/notifier.h>
18806 +#include <linux/suspend.h>
18807  
18808  #include <asm/uaccess.h>
18809  #include <asm/unistd.h>
18810 @@ -876,6 +878,59 @@ unsigned long avenrun[3];
18811  
18812  EXPORT_SYMBOL(avenrun);
18813  
18814 +#ifdef CONFIG_PM
18815 +static unsigned long avenrun_save[3];
18816 +/*
18817 + * save_avenrun - Record the values prior to starting a hibernation cycle.
18818 + * We do this to make the work done in hibernation invisible to userspace
18819 + * post-suspend. Some programs, including some MTAs, watch the load average
18820 + * and stop work until it lowers. Without this, they would stop working for
18821 + * a while post-resume, unnecessarily.
18822 + */
18823 +
18824 +static void save_avenrun(void)
18825 +{
18826 +       avenrun_save[0] = avenrun[0];
18827 +       avenrun_save[1] = avenrun[1];
18828 +       avenrun_save[2] = avenrun[2];
18829 +}
18830 +
18831 +static void restore_avenrun(void)
18832 +{
18833 +       if (!avenrun_save[0])
18834 +               return;
18835 +
18836 +       avenrun[0] = avenrun_save[0];
18837 +       avenrun[1] = avenrun_save[1];
18838 +       avenrun[2] = avenrun_save[2];
18839 +
18840 +       avenrun_save[0] = 0;
18841 +}
18842 +
18843 +static int avenrun_pm_callback(struct notifier_block *nfb,
18844 +                                       unsigned long action,
18845 +                                       void *ignored)
18846 +{
18847 +       switch (action) {
18848 +       case PM_HIBERNATION_PREPARE:
18849 +               save_avenrun();
18850 +               return NOTIFY_OK;
18851 +       case PM_POST_HIBERNATION:
18852 +               restore_avenrun();
18853 +               return NOTIFY_OK;
18854 +       }
18855 +
18856 +       return NOTIFY_DONE;
18857 +}
18858 +
18859 +static void register_pm_notifier_callback(void)
18860 +{
18861 +       pm_notifier(avenrun_pm_callback, 0);
18862 +}
18863 +#else
18864 +static inline void register_pm_notifier_callback(void) { }
18865 +#endif
18866 +
18867  /*
18868   * calc_load - given tick count, update the avenrun load estimates.
18869   * This is called while holding a write_lock on xtime_lock.
18870 @@ -1374,6 +1429,7 @@ void __init init_timers(void)
18871         BUG_ON(err == NOTIFY_BAD);
18872         register_cpu_notifier(&timers_nb);
18873         open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
18874 +       register_pm_notifier_callback();
18875  }
18876  
18877  /**
18878 diff --git a/lib/vsprintf.c b/lib/vsprintf.c
18879 index 6021757..457c957 100644
18880 --- a/lib/vsprintf.c
18881 +++ b/lib/vsprintf.c
18882 @@ -482,6 +482,29 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
18883         return buf;
18884  }
18885  
18886 +/*
18887 + * snprintf_used
18888 + *
18889 + * Functionality    : Print a string with parameters to a buffer of a
18890 + *                    limited size. Unlike snprintf, we return the number
18891 + *                    of bytes actually put in the buffer, not the number
18892 + *                    that would have been put in if it were big enough.
18893 + */
18894 +int snprintf_used(char *buffer, int buffer_size, const char *fmt, ...)
18895 +{
18896 +       int result;
18897 +       va_list args;
18898 +
18899 +       if (!buffer_size)
18900 +               return 0;
18901 +
18902 +       va_start(args, fmt);
18903 +       result = vsnprintf(buffer, buffer_size, fmt, args);
18904 +       va_end(args);
18905 +
18906 +       return result > buffer_size ? buffer_size : result;
18907 +}
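An illustration of the contract (hypothetical values): with the code as
written, the return value is clamped to the buffer size rather than reporting
the would-be length:

	char buf[8];
	int n;

	/* Plain snprintf() would return 10, the untruncated length. */
	n = snprintf_used(buf, sizeof(buf), "%s", "0123456789");
	/* buf holds "0123456" plus the NUL; n == 8, the clamped size. */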
18908 +
18909  /**
18910   * vsnprintf - Format a string and place it in a buffer
18911   * @buf: The buffer to place the result into
18912 diff --git a/mm/Makefile b/mm/Makefile
18913 index a5b0dd9..ba0df45 100644
18914 --- a/mm/Makefile
18915 +++ b/mm/Makefile
18916 @@ -11,7 +11,7 @@ obj-y                 := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
18917                            page_alloc.o page-writeback.o pdflush.o \
18918                            readahead.o swap.o truncate.o vmscan.o \
18919                            prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
18920 -                          page_isolation.o $(mmu-y)
18921 +                          dyn_pageflags.o page_isolation.o $(mmu-y)
18922  
18923  obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
18924  obj-$(CONFIG_BOUNCE)   += bounce.o
18925 diff --git a/mm/dyn_pageflags.c b/mm/dyn_pageflags.c
18926 new file mode 100644
18927 index 0000000..30d95f0
18928 --- /dev/null
18929 +++ b/mm/dyn_pageflags.c
18930 @@ -0,0 +1,801 @@
18931 +/*
18932 + * mm/dyn_pageflags.c
18933 + *
18934 + * Copyright (C) 2004-2007 Nigel Cunningham <nigel at tuxonice net>
18935 + *
18936 + * This file is released under the GPLv2.
18937 + *
18938 + * Routines for dynamically allocating and releasing bitmaps
18939 + * used as pseudo-pageflags.
18940 + *
18941 + * We use bitmaps, built out of order zero allocations and
18942 + * linked together by kzalloc'd arrays of pointers into
18943 + * an array that looks like...
18944 + *
18945 + *     pageflags->bitmap[node][zone_id][page_num][ul]
18946 + *
18947 + * All of this is transparent to the caller, who just uses
18948 + * the allocate & free routines to create/destroy bitmaps,
18949 + * and get/set/clear to operate on individual flags.
18950 + *
18951 + * Bitmaps can be sparse, with the individual pages only being
18952 + * allocated when a bit is set in the page.
18953 + *
18954 + * Memory hotplugging support is work in progress. A zone's
18955 + * start_pfn may change. If it does, we need to reallocate
18956 + * the zone bitmap, adding additional pages to the front to
18957 + * cover the bitmap. For simplicity, we don't shift the
18958 + * contents of existing pages around. The lock is only used
18959 + * to avoid reentrancy when resizing zones. The replacement
18960 + * of old data with new is done atomically. If we try to test
18961 + * a bit in the new area before the update is completed, we
18962 + * know it's zero.
18963 + *
18964 + * TuxOnIce knows the structure of these pageflags, so that
18965 + * it can serialise them in the image header. TODO: Make
18966 + * that support more generic so that TuxOnIce doesn't need
18967 + * to know how dyn_pageflags are stored.
18968 + */
18969 +
18970 +/* Avoid warnings in include/linux/mm.h */
18971 +struct page;
18972 +struct dyn_pageflags;
18973 +int test_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
18974 +
18975 +#include <linux/bootmem.h>
18976 +#include <linux/dyn_pageflags.h>
18977 +#include <linux/module.h>
18978 +
18979 +static LIST_HEAD(flags_list);
18980 +static DEFINE_SPINLOCK(flags_list_lock);
18981 +
18982 +static void* (*dyn_allocator)(unsigned long size, unsigned long flags);
18983 +
18984 +static int dyn_pageflags_debug;
18985 +
18986 +#define PR_DEBUG(a, b...) \
18987 +       do { if (dyn_pageflags_debug) printk(a, ##b); } while (0)
18988 +#define DUMP_DEBUG(bitmap) \
18989 +       do { if (dyn_pageflags_debug) dump_pagemap(bitmap); } while (0)
18990 +
18991 +#if BITS_PER_LONG == 32
18992 +#define UL_SHIFT 5
18993 +#else
18994 +#if BITS_PER_LONG == 64
18995 +#define UL_SHIFT 6
18996 +#else
18997 +#error Bits per long not 32 or 64?
18998 +#endif
18999 +#endif
19000 +
19001 +#define BIT_NUM_MASK ((sizeof(unsigned long) << 3) - 1)
19002 +#define PAGE_NUM_MASK (~((1 << (PAGE_SHIFT + 3)) - 1))
19003 +#define UL_NUM_MASK (~(BIT_NUM_MASK | PAGE_NUM_MASK))
19004 +
19005 +/*
19006 + * PAGENUMBER gives the index of the page within the zone.
19007 + * PAGEINDEX gives the index of the unsigned long within that page.
19008 + * PAGEBIT gives the index of the bit within the unsigned long.
19009 + */
19010 +#define PAGENUMBER(zone_offset) ((int) (zone_offset >> (PAGE_SHIFT + 3)))
19011 +#define PAGEINDEX(zone_offset) ((int) ((zone_offset & UL_NUM_MASK) >> UL_SHIFT))
19012 +#define PAGEBIT(zone_offset) ((int) (zone_offset & BIT_NUM_MASK))
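+
+/*
+ * Worked example (editorial; assumes 4K pages and 64-bit longs, so one
+ * bitmap page covers PAGE_SIZE << 3 = 32768 pfns): for zone_offset
+ * 74565 (0x12345),
+ *
+ *     PAGENUMBER(74565) = 74565 >> 15          = 2   (third bitmap page)
+ *     PAGEINDEX(74565)  = (74565 % 32768) / 64 = 141
+ *     PAGEBIT(74565)    = 74565 & 63           = 5
+ *
+ * i.e. bit 5 of the 142nd unsigned long in the third page.
+ */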
19013 +
19014 +#define PAGE_UL_PTR(bitmap, node, zone_num, zone_pfn) \
19015 +       ((bitmap[node][zone_num][PAGENUMBER(zone_pfn)])+PAGEINDEX(zone_pfn))
19016 +
19017 +#define pages_for_zone(zone) \
19018 +       (DIV_ROUND_UP((zone)->spanned_pages, (PAGE_SIZE << 3)))
19019 +
19020 +#define pages_for_span(span) \
19021 +       (DIV_ROUND_UP(span, PAGE_SIZE << 3))
19022 +
19023 +/* __maybe_unused for testing functions below */
19024 +#define GET_BIT_AND_UL(pageflags, page) \
19025 +       struct zone *zone = page_zone(page); \
19026 +       unsigned long pfn = page_to_pfn(page); \
19027 +       unsigned long zone_pfn = pfn - zone->zone_start_pfn; \
19028 +       int node = page_to_nid(page); \
19029 +       int zone_num = zone_idx(zone); \
19030 +       int pagenum = PAGENUMBER(zone_pfn) + 2; \
19031 +       int page_offset = PAGEINDEX(zone_pfn); \
19032 +       unsigned long **zone_array = ((pageflags)->bitmap && \
19033 +               (pageflags)->bitmap[node] && \
19034 +               (pageflags)->bitmap[node][zone_num]) ? \
19035 +                       (pageflags)->bitmap[node][zone_num] : NULL; \
19036 +       unsigned long __maybe_unused *ul = (zone_array && \
19037 +               (unsigned long) zone_array[0] <= pfn && \
19038 +               (unsigned long) zone_array[1] >= (pagenum-2) && \
19039 +               zone_array[pagenum]) ? zone_array[pagenum] + page_offset : \
19040 +                 NULL; \
19041 +       int bit __maybe_unused = PAGEBIT(zone_pfn);
19042 +
19043 +#define for_each_online_pgdat_zone(pgdat, zone_nr) \
19044 +       for_each_online_pgdat(pgdat) \
19045 +               for (zone_nr = 0; zone_nr < MAX_NR_ZONES; zone_nr++)
19046 +
19047 +/**
19048 + * dump_pagemap - Display the contents of a bitmap for debugging purposes.
19049 + *
19050 + * @pagemap: The array to be dumped.
19051 + */
19052 +void dump_pagemap(struct dyn_pageflags *pagemap)
19053 +{
19054 +       int i = 0;
19055 +       struct pglist_data *pgdat;
19056 +       unsigned long ****bitmap = pagemap->bitmap;
19057 +
19058 +       printk(" --- Dump bitmap %p ---\n", pagemap);
19059 +
19060 +       printk(KERN_INFO "%p: Sparse flag = %d\n",
19061 +                       &pagemap->sparse, pagemap->sparse);
19062 +       printk(KERN_INFO "%p: Bitmap      = %p\n",
19063 +                       &pagemap->bitmap, bitmap);
19064 +
19065 +       if (!bitmap)
19066 +               goto out;
19067 +
19068 +       for_each_online_pgdat(pgdat) {
19069 +               int node_id = pgdat->node_id, zone_nr;
19070 +               printk(KERN_INFO "%p: Node %d => %p\n",
19071 +                               &bitmap[node_id], node_id,
19072 +                               bitmap[node_id]);
19073 +               if (!bitmap[node_id])
19074 +                       continue;
19075 +               for (zone_nr = 0; zone_nr < MAX_NR_ZONES; zone_nr++) {
19076 +                       printk(KERN_INFO "%p:   Zone %d => %p%s\n",
19077 +                                       &bitmap[node_id][zone_nr], zone_nr,
19078 +                                       bitmap[node_id][zone_nr],
19079 +                                       bitmap[node_id][zone_nr] ? "" :
19080 +                                               " (empty)");
19081 +                       if (!bitmap[node_id][zone_nr])
19082 +                               continue;
19083 +
19084 +                       printk(KERN_INFO "%p:     Zone start pfn  = %p\n",
19085 +                                       &bitmap[node_id][zone_nr][0],
19086 +                                       bitmap[node_id][zone_nr][0]);
19087 +                       printk(KERN_INFO "%p:     Number of pages = %p\n",
19088 +                                       &bitmap[node_id][zone_nr][1],
19089 +                                       bitmap[node_id][zone_nr][1]);
19090 +                       for (i = 2; i < (unsigned long) bitmap[node_id]
19091 +                                       [zone_nr][1] + 2; i++)
19092 +                               printk(KERN_INFO
19093 +                                       "%p:     Page %2d         = %p\n",
19094 +                                       &bitmap[node_id][zone_nr][i],
19095 +                                       i - 2,
19096 +                                       bitmap[node_id][zone_nr][i]);
19097 +               }
19098 +       }
19099 +out:
19100 +       printk(KERN_INFO " --- Dump of bitmap %p finishes\n", pagemap);
19101 +}
19102 +EXPORT_SYMBOL_GPL(dump_pagemap);
19103 +
19104 +/**
19105 + * clear_dyn_pageflags - Zero all pageflags in a bitmap.
19106 + *
19107 + * @pagemap: The array to be cleared.
19108 + *
19109 + * Clear an array used to store dynamically allocated pageflags.
19110 + */
19111 +void clear_dyn_pageflags(struct dyn_pageflags *pagemap)
19112 +{
19113 +       int i = 0, zone_idx;
19114 +       struct pglist_data *pgdat;
19115 +       unsigned long ****bitmap = pagemap->bitmap;
+
+       /* Nothing to clear if the bitmap was never allocated. */
+       if (!bitmap)
+               return;
19116 +
19117 +       for_each_online_pgdat_zone(pgdat, zone_idx) {
19118 +               int node_id = pgdat->node_id;
19119 +               struct zone *zone = &pgdat->node_zones[zone_idx];
19120 +
19121 +               if (!populated_zone(zone) ||
19122 +                  (!bitmap[node_id] || !bitmap[node_id][zone_idx]))
19123 +                       continue;
19124 +
19125 +               for (i = 2; i < pages_for_zone(zone) + 2; i++)
19126 +                       if (bitmap[node_id][zone_idx][i])
19127 +                               memset((bitmap[node_id][zone_idx][i]), 0,
19128 +                                               PAGE_SIZE);
19129 +       }
19130 +}
19131 +EXPORT_SYMBOL_GPL(clear_dyn_pageflags);
19132 +
19133 +/**
19134 + * Allocators.
19135 + *
19136 + * During boot time, we want to use alloc_bootmem_low. Afterwards, we want
19137 + * kzalloc. These routines let us do that without causing compile time warnings
19138 + * about mismatched sections, as would happen if we did a simple
19139 + * boot ? alloc_bootmem_low() : kzalloc() below.
19140 + */
19141 +
19142 +/**
19143 + * boot_time_allocator - Allocator used while booting.
19144 + *
19145 + * @size: Number of bytes wanted.
19146 + * @flags: Allocation flags (ignored here).
19147 + */
19148 +static __init void *boot_time_allocator(unsigned long size, unsigned long flags)
19149 +{
19150 +       return alloc_bootmem_low(size);
19151 +}
19152 +
19153 +/**
19154 + * normal_allocator - Allocator used post-boot.
19155 + *
19156 + * @size: Number of bytes wanted.
19157 + * @flags: Allocation flags.
19158 + *
19159 + * Allocate memory for our page flags.
19160 + */
19161 +static void *normal_allocator(unsigned long size, unsigned long flags)
19162 +{
19163 +       if (size == PAGE_SIZE)
19164 +               return (void *) get_zeroed_page(flags);
19165 +       else
19166 +               return kzalloc(size, flags);
19167 +}
19168 +
19169 +/**
19170 + * dyn_pageflags_init - Do the earliest initialisation.
19171 + *
19172 + * Very early in the boot process, point our allocator at
19173 + * alloc_bootmem_low, so bitmaps can be allocated before kzalloc works.
19174 + */
19175 +void __init dyn_pageflags_init(void)
19176 +{
19177 +       dyn_allocator = boot_time_allocator;
19178 +}
19179 +
19180 +/**
19181 + * dyn_pageflags_use_kzalloc - Reset the allocator for normal use.
19182 + *
19183 + * Reset the allocator to our normal, post-boot function.
19184 + */
19185 +void __init dyn_pageflags_use_kzalloc(void)
19186 +{
19187 +       dyn_allocator = normal_allocator;
19188 +}
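+
+/*
+ * Editorial sketch of the intended boot-time sequence (the call sites
+ * are assumed to live in the architecture/boot code, not shown here):
+ *
+ *     dyn_pageflags_init();        // pre-mm: bitmaps via alloc_bootmem_low
+ *     ...page allocator and slab come up...
+ *     dyn_pageflags_use_kzalloc(); // switch to kzalloc/get_zeroed_page
+ */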
19189 +
19190 +/**
19191 + * try_alloc_dyn_pageflag_part - Try to allocate a pointer array.
19192 + *
19193 + * Try to allocate a contiguous array of pointers.
19194 + */
19195 +static int try_alloc_dyn_pageflag_part(int nr_ptrs, void **ptr)
19196 +{
19197 +       *ptr = (*dyn_allocator)(sizeof(void *) * nr_ptrs, GFP_ATOMIC);
19198 +
19199 +       if (*ptr)
19200 +               return 0;
19201 +
19202 +       printk(KERN_INFO
19203 +               "Error. Unable to allocate memory for dynamic pageflags.");
19204 +       return -ENOMEM;
19205 +}
19206 +
19207 +static int populate_bitmap_page(struct dyn_pageflags *pageflags, int take_lock,
19208 +                       unsigned long **page_ptr)
19209 +{
19210 +       void *address;
19211 +       unsigned long flags = 0;
19212 +
19213 +       if (take_lock)
19214 +               spin_lock_irqsave(&pageflags->struct_lock, flags);
19215 +
19216 +       /*
19217 +        * The page may have been allocated while we waited.
19218 +        */
19219 +       if (*page_ptr)
19220 +               goto out;
19221 +
19222 +       address = (*dyn_allocator)(PAGE_SIZE, GFP_ATOMIC);
19223 +
19224 +       if (!address) {
19225 +               PR_DEBUG("Error. Unable to allocate memory for "
19226 +                       "dynamic pageflags page.\n");
19227 +               if (take_lock)
19228 +                       spin_unlock_irqrestore(&pageflags->struct_lock, flags);
19229 +               return -ENOMEM;
19230 +       }
19231 +
19232 +       *page_ptr = address;
19233 +out:
19234 +       if (take_lock)
19235 +               spin_unlock_irqrestore(&pageflags->struct_lock, flags);
19236 +       return 0;
19237 +}
19238 +
19239 +/**
19240 + * resize_zone_bitmap - Resize the array of pages for a bitmap.
19241 + *
19242 + * Shrink or extend a list of pages for a zone in a bitmap, preserving
19243 + * existing data.
19244 + */
19245 +static int resize_zone_bitmap(struct dyn_pageflags *pagemap, struct zone *zone,
19246 +               unsigned long old_pages, unsigned long new_pages,
19247 +               unsigned long copy_offset, int take_lock)
19248 +{
19249 +       unsigned long **new_ptr = NULL, ****bitmap = pagemap->bitmap;
19250 +       int node_id = zone_to_nid(zone), zone_idx = zone_idx(zone),
19251 +           to_copy = min(old_pages, new_pages), result = 0;
19252 +       unsigned long **old_ptr = bitmap[node_id][zone_idx], i;
19253 +
19254 +       if (new_pages) {
19255 +               if (try_alloc_dyn_pageflag_part(new_pages + 2,
19256 +                                       (void **) &new_ptr))
19257 +                       return -ENOMEM;
19258 +
19259 +               if (old_pages)
19260 +                       memcpy(new_ptr + 2 + copy_offset, old_ptr + 2,
19261 +                                       sizeof(unsigned long *) * to_copy);
19262 +
19263 +               new_ptr[0] = (void *) zone->zone_start_pfn;
19264 +               new_ptr[1] = (void *) new_pages;
19265 +       }
19266 +
19267 +       /* Free/alloc bitmap pages. */
19268 +       if (old_pages > new_pages) {
19269 +               for (i = new_pages + 2; i < old_pages + 2; i++)
19270 +                       if (old_ptr[i])
19271 +                               free_page((unsigned long) old_ptr[i]);
19272 +       } else if (!pagemap->sparse) {
19273 +               for (i = old_pages + 2; i < new_pages + 2; i++)
19274 +                       if (populate_bitmap_page(NULL, take_lock,
19275 +                                       (unsigned long **) &new_ptr[i])) {
19276 +                               result = -ENOMEM;
19277 +                               break;
19278 +                       }
19279 +       }
19280 +
19281 +       bitmap[node_id][zone_idx] = new_ptr;
19282 +       kfree(old_ptr);
19283 +       return result;
19284 +}
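+
+/*
+ * Editorial note on the per-zone array layout relied upon above:
+ *
+ *     ptr[0]   zone_start_pfn at allocation time
+ *     ptr[1]   number of bitmap pages in the array
+ *     ptr[2..] the bitmap pages themselves (possibly NULL if sparse)
+ */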
19285 +
19286 +/**
19287 + * check_dyn_pageflag_zone - Resize one zone's part of a dyn_pageflag array.
19288 + *
19289 + * @pagemap: The array to be worked on.
19290 + * @zone: The zone to get in sync with reality.
19291 + *
19292 + * Check the pagemap has correct allocations for the zone. This can be
19293 + * invoked when allocating a new bitmap, or for hot[un]plug, and so
19294 + * must deal with any disparities between zone_start_pfn/spanned_pages
19295 + * and what we have allocated. In addition, we must deal with the possibility
19296 + * of zone_start_pfn having changed.
19297 + */
19298 +int check_dyn_pageflag_zone(struct dyn_pageflags *pagemap, struct zone *zone,
19299 +               int force_free_all, int take_lock)
19300 +{
19301 +       int node_id = zone_to_nid(zone), zone_idx = zone_idx(zone);
19302 +       unsigned long copy_offset = 0, old_pages, new_pages;
19303 +       unsigned long **old_ptr = pagemap->bitmap[node_id][zone_idx];
19304 +
19305 +       old_pages = old_ptr ? (unsigned long) old_ptr[1] : 0;
19306 +       new_pages = force_free_all ? 0 : pages_for_span(zone->spanned_pages);
19307 +
19308 +       if (old_pages == new_pages &&
19309 +           (!old_pages || (unsigned long) old_ptr[0] == zone->zone_start_pfn))
19310 +               return 0;
19311 +
19312 +       if (old_pages &&
19313 +           (unsigned long) old_ptr[0] != zone->zone_start_pfn)
19314 +               copy_offset = pages_for_span((unsigned long) old_ptr[0] -
19315 +                                                       zone->zone_start_pfn);
19316 +
19317 +       /* New/expanded zone? */
19318 +       return resize_zone_bitmap(pagemap, zone, old_pages, new_pages,
19319 +                       copy_offset, take_lock);
19320 +}
19321 +
19322 +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
19323 +/**
19324 + * dyn_pageflags_hotplug - Add pages to bitmaps for hotplugged memory.
19325 + *
19326 + * Attempt to expand bitmaps to cover hotplugged memory. Failures are
19327 + * ignored: since sparse bitmaps are handled anyway, missing pages will
19328 + * be populated on demand.
19329 + */
19330 +void dyn_pageflags_hotplug(struct zone *zone)
19331 +{
19332 +       struct dyn_pageflags *this;
19333 +
19334 +       list_for_each_entry(this, &flags_list, list)
19335 +               check_dyn_pageflag_zone(this, zone, 0, 1);
19336 +}
19337 +#endif
19338 +
19339 +/**
19340 + * free_dyn_pageflags - Free an array of dynamically allocated pageflags.
19341 + *
19342 + * @pagemap: The array to be freed.
19343 + *
19344 + * Free a dynamically allocated pageflags bitmap.
19345 + */
19346 +void free_dyn_pageflags(struct dyn_pageflags *pagemap)
19347 +{
19348 +       int zone_idx;
19349 +       struct pglist_data *pgdat;
19350 +       unsigned long flags;
19351 +
19352 +       DUMP_DEBUG(pagemap);
19353 +
19354 +       if (!pagemap->bitmap)
19355 +               return;
19356 +
19357 +       for_each_online_pgdat_zone(pgdat, zone_idx)
19358 +               check_dyn_pageflag_zone(pagemap,
19359 +                               &pgdat->node_zones[zone_idx], 1, 1);
19360 +
19361 +       for_each_online_pgdat(pgdat) {
19362 +               int i = pgdat->node_id;
19363 +
19364 +               if (pagemap->bitmap[i])
19365 +                       kfree((pagemap->bitmap)[i]);
19366 +       }
19367 +
19368 +       kfree(pagemap->bitmap);
19369 +       pagemap->bitmap = NULL;
19370 +
19371 +       pagemap->initialised = 0;
19372 +
19373 +       if (!pagemap->sparse) {
19374 +               spin_lock_irqsave(&flags_list_lock, flags);
19375 +               list_del_init(&pagemap->list);
19376 +               pagemap->sparse = 1;
19377 +               spin_unlock_irqrestore(&flags_list_lock, flags);
19378 +       }
19379 +}
19380 +EXPORT_SYMBOL_GPL(free_dyn_pageflags);
19381 +
19382 +/**
19383 + * allocate_dyn_pageflags - Allocate a bitmap.
19384 + *
19385 + * @pagemap: The bitmap we want to allocate.
19386 + * @sparse: Whether to make the array sparse.
19387 + *
19388 + * Prepare the array. If sparse, we don't allocate the actual
19389 + * pages until they're needed. If not sparse, we add the bitmap to the
19390 + * list so that, where memory hotplugging is supported, we can allocate
19391 + * new pages on hotplug events.
19392 + *
19393 + * This routine may be called directly, or indirectly when the first bit
19394 + * needs to be set on a previously unused bitmap.
19395 + */
19396 +int allocate_dyn_pageflags(struct dyn_pageflags *pagemap, int sparse)
19397 +{
19398 +       int zone_idx, result = -ENOMEM;
19399 +       struct zone *zone;
19400 +       struct pglist_data *pgdat;
19401 +       unsigned long flags;
19402 +
19403 +       if (!sparse && (pagemap->sparse || !pagemap->initialised)) {
19404 +               spin_lock_irqsave(&flags_list_lock, flags);
19405 +               list_add(&pagemap->list, &flags_list);
19406 +               spin_unlock_irqrestore(&flags_list_lock, flags);
19407 +       }
19408 +
19409 +       spin_lock_irqsave(&pagemap->struct_lock, flags);
19410 +
19411 +       pagemap->initialised = 1;
19412 +       pagemap->sparse = sparse;
19413 +
19414 +       if (!pagemap->bitmap && try_alloc_dyn_pageflag_part((1 << NODES_WIDTH),
19415 +                               (void **) &pagemap->bitmap))
19416 +               goto out;
19417 +
19418 +       for_each_online_pgdat(pgdat) {
19419 +               int node_id = pgdat->node_id;
19420 +
19421 +               if (!pagemap->bitmap[node_id] &&
19422 +                   try_alloc_dyn_pageflag_part(MAX_NR_ZONES,
19423 +                       (void **) &(pagemap->bitmap)[node_id]))
19424 +                               goto out;
19425 +
19426 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
19427 +                       zone = &pgdat->node_zones[zone_idx];
19428 +
19429 +                       if (populated_zone(zone) &&
19430 +                           check_dyn_pageflag_zone(pagemap, zone, 0, 0))
19431 +                               goto out;
19432 +               }
19433 +       }
19434 +
19435 +       result = 0;
19436 +
19437 +out:
19438 +       spin_unlock_irqrestore(&pagemap->struct_lock, flags);
19439 +       return result;
19440 +}
19441 +EXPORT_SYMBOL_GPL(allocate_dyn_pageflags);
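+
+/*
+ * Editorial note: with sparse == 1, only the node and zone pointer
+ * levels are allocated here; the bitmap pages themselves appear lazily
+ * via set_dynpageflag() -> populate_bitmap_page(). With sparse == 0,
+ * every page is allocated up front and the map is registered on
+ * flags_list so hotplug events can resize it.
+ */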
19442 +
19443 +/**
19444 + * test_dynpageflag - Test a page in a bitmap.
19445 + *
19446 + * @bitmap: The bitmap we're checking.
19447 + * @page: The page for which we want to test the matching bit.
19448 + *
19449 + * Test whether the bit is set in the array. If the array is sparse and
19450 + * the relevant page has not been allocated, the result is zero.
19451 + */
19452 +int test_dynpageflag(struct dyn_pageflags *bitmap, struct page *page)
19453 +{
19454 +       GET_BIT_AND_UL(bitmap, page);
19455 +       return ul ? test_bit(bit, ul) : 0;
19456 +}
19457 +EXPORT_SYMBOL_GPL(test_dynpageflag);
19458 +
19459 +/**
19460 + * set_dynpageflag - Set a bit in a bitmap.
19461 + *
19462 + * @pageflags: The bitmap we're operating on.
19463 + * @page: The page for which we want to set the matching bit.
19464 + *
19465 + * Set the associated bit in the array. If the array is sparse, we
19466 + * seek to allocate the missing page.
19467 + */
19468 +void set_dynpageflag(struct dyn_pageflags *pageflags, struct page *page)
19469 +{
19470 +       GET_BIT_AND_UL(pageflags, page);
19471 +
19472 +       if (!ul) {
19473 +               /*
19474 +                * Sparse, hotplugged or unprepared.
19475 +                * Allocate / fill gaps in high levels
19476 +                */
19477 +               if (allocate_dyn_pageflags(pageflags, 1) ||
19478 +                   populate_bitmap_page(pageflags, 1, (unsigned long **)
19479 +                               &pageflags->bitmap[node][zone_num][pagenum])) {
19480 +                       printk(KERN_EMERG "Failed to allocate storage in a "
19481 +                                       "sparse bitmap.\n");
19482 +                       dump_pagemap(pageflags);
19483 +                       BUG();
19484 +               }
19485 +               set_dynpageflag(pageflags, page);
19486 +       } else
19487 +               set_bit(bit, ul);
19488 +}
19489 +EXPORT_SYMBOL_GPL(set_dynpageflag);
19490 +
19491 +/**
19492 + * clear_dynpageflag - Clear a bit in a bitmap.
19493 + *
19494 + * @bitmap: The bitmap we're operating on.
19495 + * @page: The page for which we want to clear the matching bit.
19496 + *
19497 + * Clear the associated bit in the array. It is not an error to be asked
19498 + * to clear a bit on a page we haven't allocated.
19499 + */
19500 +void clear_dynpageflag(struct dyn_pageflags *bitmap, struct page *page)
19501 +{
19502 +       GET_BIT_AND_UL(bitmap, page);
19503 +       if (ul)
19504 +               clear_bit(bit, ul);
19505 +}
19506 +EXPORT_SYMBOL_GPL(clear_dynpageflag);
19507 +
19508 +/**
19509 + * get_next_bit_on - Get the next bit in a bitmap.
19510 + *
19511 + * @pageflags: The bitmap we're searching.
19512 + * @counter: The previous pfn. We always return a value > this.
19513 + *
19514 + * Given a pfn (possibly max_pfn+1), find the next pfn in the bitmap that
19515 + * is set. If there are no more flags set, return max_pfn+1.
19516 + */
19517 +unsigned long get_next_bit_on(struct dyn_pageflags *pageflags,
19518 +               unsigned long counter)
19519 +{
19520 +       struct page *page;
19521 +       struct zone *zone;
19522 +       unsigned long *ul = NULL;
19523 +       unsigned long zone_offset;
19524 +       int pagebit, zone_num, first = (counter == (max_pfn + 1)), node;
19525 +
19526 +       if (first)
19527 +               counter = first_online_pgdat()->node_zones->zone_start_pfn;
19528 +
19529 +       page = pfn_to_page(counter);
19530 +       zone = page_zone(page);
19531 +       node = zone->zone_pgdat->node_id;
19532 +       zone_num = zone_idx(zone);
19533 +       zone_offset = counter - zone->zone_start_pfn;
19534 +
19535 +       if (first)
19536 +               goto test;
19537 +
19538 +       do {
19539 +               zone_offset++;
19540 +
19541 +               if (zone_offset >= zone->spanned_pages) {
19542 +                       do {
19543 +                               zone = next_zone(zone);
19544 +                               if (!zone)
19545 +                                       return max_pfn + 1;
19546 +                       } while (!zone->spanned_pages);
19547 +
19548 +                       zone_num = zone_idx(zone);
19549 +                       node = zone->zone_pgdat->node_id;
19550 +                       zone_offset = 0;
19551 +               }
19552 +test:
19553 +               pagebit = PAGEBIT(zone_offset);
19554 +
19555 +               if (!pagebit || !ul) {
19556 +                       ul = pageflags->bitmap[node][zone_num]
19557 +                               [PAGENUMBER(zone_offset)+2];
19558 +                       if (ul)
19559 +                               ul += PAGEINDEX(zone_offset);
19560 +                       else {
19561 +                               PR_DEBUG("Unallocated page. Skipping from zone"
19562 +                                       " offset %lu to the start of the next "
19563 +                                       "one.\n", zone_offset);
19564 +                               zone_offset = roundup(zone_offset + 1,
19565 +                                               PAGE_SIZE << 3) - 1;
19566 +                               PR_DEBUG("New zone offset is %lu.\n",
19567 +                                               zone_offset);
19568 +                               continue;
19569 +                       }
19570 +               }
19571 +
19572 +               if (!ul || !(*ul & ~((1UL << pagebit) - 1))) {
19573 +                       zone_offset += BITS_PER_LONG - pagebit - 1;
19574 +                       continue;
19575 +               }
19576 +
19577 +       } while (!ul || !test_bit(pagebit, ul));
19578 +
19579 +       return zone->zone_start_pfn + zone_offset;
19580 +}
19581 +EXPORT_SYMBOL_GPL(get_next_bit_on);
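+
+/*
+ * Editorial iteration sketch ("my_map" hypothetical): walk every set
+ * bit, using max_pfn + 1 both as the "start from the beginning"
+ * sentinel and as the end-of-bitmap return value:
+ *
+ *     unsigned long pfn = max_pfn + 1;
+ *
+ *     while ((pfn = get_next_bit_on(&my_map, pfn)) <= max_pfn)
+ *             handle(pfn_to_page(pfn));
+ */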
19582 +
19583 +#ifdef SELF_TEST
19584 +#include <linux/jiffies.h>
19585 +
19586 +static __init int dyn_pageflags_test(void)
19587 +{
19588 +       struct dyn_pageflags test_map;
19589 +       struct page *test_page1 = pfn_to_page(1);
19590 +       unsigned long pfn = 0, start, end;
19591 +       int i, iterations;
19592 +
19593 +       memset(&test_map, 0, sizeof(test_map));
19594 +
19595 +       printk("Dynpageflags testing...\n");
19596 +
19597 +       printk(KERN_INFO "Set page 1...");
19598 +       set_dynpageflag(&test_map, test_page1);
19599 +       if (test_dynpageflag(&test_map, test_page1))
19600 +               printk(KERN_INFO "Ok.\n");
19601 +       else
19602 +               printk(KERN_INFO "FAILED.\n");
19603 +
19604 +       printk(KERN_INFO "Test memory hotplugging #1 ...");
19605 +       {
19606 +               unsigned long orig_size;
19607 +               GET_BIT_AND_UL(&test_map, test_page1);
19608 +               orig_size = (unsigned long) test_map.bitmap[node][zone_num][1];
19609 +               /*
19610 +                * Use the code triggered when zone_start_pfn lowers,
19611 +                * checking that our bit is then set in the third page.
19612 +                */
19613 +               resize_zone_bitmap(&test_map, zone, orig_size,
19614 +                               orig_size + 2, 2, 0);
19615 +               DUMP_DEBUG(&test_map);
19616 +               if ((unsigned long) test_map.bitmap[node][zone_num]
19617 +                               [pagenum + 2] &&
19618 +                   (unsigned long) test_map.bitmap[node][zone_num]
19619 +                               [pagenum + 2][0] == 2UL)
19620 +                       printk(KERN_INFO "Ok.\n");
19621 +               else
19622 +                       printk(KERN_INFO "FAILED.\n");
19623 +       }
19624 +
19625 +       printk(KERN_INFO "Test memory hotplugging #2 ...");
19626 +       {
19627 +               /*
19628 +                * Test expanding bitmap length.
19629 +                */
19630 +               unsigned long orig_size;
19631 +               GET_BIT_AND_UL(&test_map, test_page1);
19632 +               orig_size = (unsigned long) test_map.bitmap[node]
19633 +                                                       [zone_num][1];
19634 +               resize_zone_bitmap(&test_map, zone, orig_size,
19635 +                               orig_size + 2, 0, 0);
19636 +               DUMP_DEBUG(&test_map);
19637 +               pagenum += 2; /* Offset for first test */
19638 +               if (test_map.bitmap[node][zone_num][pagenum] &&
19639 +                   test_map.bitmap[node][zone_num][pagenum][0] == 2UL &&
19640 +                   (unsigned long) test_map.bitmap[node][zone_num][1] ==
19641 +                                               orig_size + 2)
19642 +                       printk(KERN_INFO "Ok.\n");
19643 +               else
19644 +                       printk(KERN_INFO "FAILED ([%d][%d][%d]: %p && %lu == "
19645 +                               "2UL  && %p == %lu).\n",
19646 +                               node, zone_num, pagenum,
19647 +                               test_map.bitmap[node][zone_num][pagenum],
19648 +                               test_map.bitmap[node][zone_num][pagenum] ?
19649 +                               test_map.bitmap[node][zone_num][pagenum][0] : 0,
19650 +                               test_map.bitmap[node][zone_num][1],
19651 +                               orig_size + 2);
19652 +       }
19653 +
19654 +       free_dyn_pageflags(&test_map);
19655 +
19656 +       allocate_dyn_pageflags(&test_map, 0);
19657 +
19658 +       start = jiffies;
19659 +
19660 +       iterations = 25000000 / max_pfn;
19661 +
19662 +       for (i = 0; i < iterations; i++) {
19663 +               for (pfn = 0; pfn < max_pfn; pfn++)
19664 +                       set_dynpageflag(&test_map, pfn_to_page(pfn));
19665 +               for (pfn = 0; pfn < max_pfn; pfn++)
19666 +                       clear_dynpageflag(&test_map, pfn_to_page(pfn));
19667 +       }
19668 +
19669 +       end = jiffies;
19670 +
19671 +       free_dyn_pageflags(&test_map);
19672 +
19673 +       printk(KERN_INFO "Dyn: %d iterations of setting & clearing all %lu "
19674 +                       "flags took %lu jiffies.\n",
19675 +                       iterations, max_pfn, end - start);
19676 +
19677 +       start = jiffies;
19678 +
19679 +       for (i = 0; i < iterations; i++) {
19680 +               for (pfn = 0; pfn < max_pfn; pfn++)
19681 +                       set_bit(7, &(pfn_to_page(pfn))->flags);
19682 +               for (pfn = 0; pfn < max_pfn; pfn++)
19683 +                       clear_bit(7, &(pfn_to_page(pfn))->flags);
19684 +       }
19685 +
19686 +       end = jiffies;
19687 +
19688 +       printk(KERN_INFO "Real flags: %d iterations of setting & clearing "
19689 +                       "all %lu flags took %lu jiffies.\n",
19690 +                       iterations, max_pfn, end - start);
19691 +
19692 +       iterations = 25000000;
19693 +
19694 +       start = jiffies;
19695 +
19696 +       for (i = 0; i < iterations; i++) {
19697 +               set_dynpageflag(&test_map, pfn_to_page(1));
19698 +               clear_dynpageflag(&test_map, pfn_to_page(1));
19699 +       }
19700 +
19701 +       end = jiffies;
19702 +
19703 +       printk(KERN_INFO "Dyn: %d iterations of setting & clearing all one "
19704 +                       "flag took %lu jiffies.\n", iterations, end - start);
19705 +
19706 +       start = jiffies;
19707 +
19708 +       for (i = 0; i < iterations; i++) {
19709 +               set_bit(7, &(pfn_to_page(1))->flags);
19710 +               clear_bit(7, &(pfn_to_page(1))->flags);
19711 +       }
19712 +
19713 +       end = jiffies;
19714 +
19715 +       printk(KERN_INFO "Real pageflag: %d iterations of setting & clearing "
19716 +                       "all one flag took %lu jiffies.\n",
19717 +                       iterations, end - start);
19718 +       return 0;
19719 +}
19720 +
19721 +late_initcall(dyn_pageflags_test);
19722 +#endif
19723 +
19724 +static int __init dyn_pageflags_debug_setup(char *str)
19725 +{
19726 +       printk(KERN_INFO "Dynamic pageflags debugging enabled.\n");
19727 +       dyn_pageflags_debug = 1;
19728 +       return 1;
19729 +}
19730 +
19731 +__setup("dyn_pageflags_debug", dyn_pageflags_debug_setup);
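+
+/*
+ * Editorial usage note: booting with "dyn_pageflags_debug" on the
+ * kernel command line enables the PR_DEBUG/DUMP_DEBUG output above.
+ */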
19732 diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
19733 index 7469c50..280a8b8 100644
19734 --- a/mm/memory_hotplug.c
19735 +++ b/mm/memory_hotplug.c
19736 @@ -77,6 +77,8 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
19737         }
19738         memmap_init_zone(nr_pages, nid, zone_type,
19739                          phys_start_pfn, MEMMAP_HOTPLUG);
19740 +
19741 +       dyn_pageflags_hotplug(zone);
19742         return 0;
19743  }
19744  
19745 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
19746 index 402a504..7dd7430 100644
19747 --- a/mm/page_alloc.c
19748 +++ b/mm/page_alloc.c
19749 @@ -1730,6 +1730,26 @@ static unsigned int nr_free_zone_pages(int offset)
19750         return sum;
19751  }
19752  
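+/*
+ * Editorial summary: walk the default zonelist and count free pages in
+ * excess of each zone's high watermark, i.e. pages that could still be
+ * allocated without pushing any zone below pages_high.
+ */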
19753 +static unsigned int nr_unallocated_zone_pages(int offset)
19754 +{
19755 +       /* Just pick one node, since fallback list is circular */
19756 +       pg_data_t *pgdat = NODE_DATA(numa_node_id());
19757 +       unsigned int sum = 0;
19758 +
19759 +       struct zonelist *zonelist = pgdat->node_zonelists + offset;
19760 +       struct zone **zonep = zonelist->zones;
19761 +       struct zone *zone;
19762 +
19763 +       for (zone = *zonep++; zone; zone = *zonep++) {
19764 +               unsigned long high = zone->pages_high;
19765 +               unsigned long left = zone_page_state(zone, NR_FREE_PAGES);
19766 +               if (left > high)
19767 +                       sum += left - high;
19768 +       }
19769 +
19770 +       return sum;
19771 +}
19772 +
19773  /*
19774   * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
19775   */
19776 @@ -1740,6 +1760,15 @@ unsigned int nr_free_buffer_pages(void)
19777  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
19778  
19779  /*
19780 + * Amount of RAM still unallocated within ZONE_DMA and ZONE_NORMAL
19781 + */
19782 +unsigned int nr_unallocated_buffer_pages(void)
19783 +{
19784 +       return nr_unallocated_zone_pages(gfp_zone(GFP_USER));
19785 +}
19786 +EXPORT_SYMBOL_GPL(nr_unallocated_buffer_pages);
19787 +
19788 +/*
19789   * Amount of free RAM allocatable within all zones
19790   */
19791  unsigned int nr_free_pagecache_pages(void)
19792 diff --git a/mm/vmscan.c b/mm/vmscan.c
19793 index 4046434..718226e 100644
19794 --- a/mm/vmscan.c
19795 +++ b/mm/vmscan.c
19796 @@ -776,6 +776,28 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
19797         return nr_taken;
19798  }
19799  
19800 +/* return_lru_pages puts a list of pages back on a zone's lru lists. */
19801 +
19802 +static void return_lru_pages(struct list_head *page_list, struct zone *zone,
19803 +               struct pagevec *pvec)
19804 +{
19805 +       while (!list_empty(page_list)) {
19806 +               struct page *page = lru_to_page(page_list);
19807 +               VM_BUG_ON(PageLRU(page));
19808 +               SetPageLRU(page);
19809 +               list_del(&page->lru);
19810 +               if (PageActive(page))
19811 +                       add_page_to_active_list(zone, page);
19812 +               else
19813 +                       add_page_to_inactive_list(zone, page);
19814 +               if (!pagevec_add(pvec, page)) {
19815 +                       spin_unlock_irq(&zone->lru_lock);
19816 +                       __pagevec_release(pvec);
19817 +                       spin_lock_irq(&zone->lru_lock);
19818 +               }
19819 +       }
19820 +}
19821 +
19822  static unsigned long isolate_pages_global(unsigned long nr,
19823                                         struct list_head *dst,
19824                                         unsigned long *scanned, int order,
19825 @@ -826,7 +848,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
19826         lru_add_drain();
19827         spin_lock_irq(&zone->lru_lock);
19828         do {
19829 -               struct page *page;
19830                 unsigned long nr_taken;
19831                 unsigned long nr_scan;
19832                 unsigned long nr_freed;
19833 @@ -888,21 +909,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
19834                 /*
19835                  * Put back any unfreeable pages.
19836                  */
19837 -               while (!list_empty(&page_list)) {
19838 -                       page = lru_to_page(&page_list);
19839 -                       VM_BUG_ON(PageLRU(page));
19840 -                       SetPageLRU(page);
19841 -                       list_del(&page->lru);
19842 -                       if (PageActive(page))
19843 -                               add_page_to_active_list(zone, page);
19844 -                       else
19845 -                               add_page_to_inactive_list(zone, page);
19846 -                       if (!pagevec_add(&pvec, page)) {
19847 -                               spin_unlock_irq(&zone->lru_lock);
19848 -                               __pagevec_release(&pvec);
19849 -                               spin_lock_irq(&zone->lru_lock);
19850 -                       }
19851 -               }
19852 +               return_lru_pages(&page_list, zone, &pvec);
19853         } while (nr_scanned < max_scan);
19854         spin_unlock(&zone->lru_lock);
19855  done:
19856 @@ -1625,6 +1632,72 @@ out:
19857         return nr_reclaimed;
19858  }
19859  
19860 +struct lru_save {
19861 +       struct zone             *zone;
19862 +       struct list_head        active_list;
19863 +       struct list_head        inactive_list;
19864 +       struct lru_save         *next;
19865 +};
19866 +
19867 +struct lru_save *lru_save_list;
19868 +
19869 +void unlink_lru_lists(void)
19870 +{
19871 +       struct zone *zone;
19872 +
19873 +       for_each_zone(zone) {
19874 +               struct lru_save *this;
19875 +               unsigned long moved, scanned;
19876 +
19877 +               if (!zone->spanned_pages)
19878 +                       continue;
19879 +
19880 +               this = (struct lru_save *)
19881 +                       kzalloc(sizeof(struct lru_save), GFP_ATOMIC);
19882 +
19883 +               BUG_ON(!this);
19884 +
19885 +               this->next = lru_save_list;
19886 +               lru_save_list = this;
19887 +
19888 +               this->zone = zone;
19889 +
19890 +               spin_lock_irq(&zone->lru_lock);
19891 +               INIT_LIST_HEAD(&this->active_list);
19892 +               INIT_LIST_HEAD(&this->inactive_list);
19893 +               moved = isolate_lru_pages(zone_page_state(zone, NR_ACTIVE),
19894 +                               &zone->active_list, &this->active_list,
19895 +                               &scanned, 0, ISOLATE_BOTH);
19896 +               __mod_zone_page_state(zone, NR_ACTIVE, -moved);
19897 +               moved = isolate_lru_pages(zone_page_state(zone, NR_INACTIVE),
19898 +                               &zone->inactive_list, &this->inactive_list,
19899 +                               &scanned, 0, ISOLATE_BOTH);
19900 +               __mod_zone_page_state(zone, NR_INACTIVE, -moved);
19901 +               spin_unlock_irq(&zone->lru_lock);
19902 +       }
19903 +}
19904 +
19905 +void relink_lru_lists(void)
19906 +{
19907 +       while (lru_save_list) {
19908 +               struct lru_save *this = lru_save_list;
19909 +               struct zone *zone = this->zone;
19910 +               struct pagevec pvec;
19911 +
19912 +               pagevec_init(&pvec, 1);
19913 +
19914 +               lru_save_list = this->next;
19915 +
19916 +               spin_lock_irq(&zone->lru_lock);
19917 +               return_lru_pages(&this->active_list, zone, &pvec);
19918 +               return_lru_pages(&this->inactive_list, zone, &pvec);
19919 +               spin_unlock_irq(&zone->lru_lock);
19920 +               pagevec_release(&pvec);
19921 +
19922 +               kfree(this);
19923 +       }
19924 +}
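+
+/*
+ * Editorial note: these two functions are intended to bracket the
+ * hibernation atomic copy, e.g. (sketch only):
+ *
+ *     unlink_lru_lists();
+ *     ...take or restore the atomic image...
+ *     relink_lru_lists();
+ *
+ * While unlinked, the pages sit on the private lists in lru_save
+ * entries, where vmscan cannot move them and invalidate the image.
+ */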
19925 +
19926  /*
19927   * The background pageout daemon, started as a kernel thread
19928   * from the init process. 
19929 @@ -1710,6 +1783,9 @@ void wakeup_kswapd(struct zone *zone, int order)
19930         if (!populated_zone(zone))
19931                 return;
19932  
19933 +       if (freezer_is_on())
19934 +               return;
19935 +
19936         pgdat = zone->zone_pgdat;
19937         if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
19938                 return;
19939 @@ -1723,6 +1799,109 @@ void wakeup_kswapd(struct zone *zone, int order)
19940  }
19941  
19942  #ifdef CONFIG_PM
19943 +static unsigned long shrink_ps1_zone(struct zone *zone,
19944 +       unsigned long total_to_free, struct scan_control sc)
19945 +{
19946 +       unsigned long freed = 0;
19947 +
19948 +       while (total_to_free > freed) {
19949 +               unsigned long nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
19950 +               struct reclaim_state reclaim_state;
19951 +
19952 +               if (nr_slab > total_to_free)
19953 +                       nr_slab = total_to_free;
19954 +
19955 +               reclaim_state.reclaimed_slab = 0;
+               /* Credit slab frees to us, as shrink_all_memory() does. */
+               current->reclaim_state = &reclaim_state;
19956 +               shrink_slab(nr_slab, sc.gfp_mask, nr_slab);
+               current->reclaim_state = NULL;
19957 +               if (!reclaim_state.reclaimed_slab)
19958 +                       return freed;
19959 +
19960 +               freed += reclaim_state.reclaimed_slab;
19961 +       }
19962 +
19963 +       return freed;
19964 +}
19965 +
19966 +unsigned long shrink_ps2_zone(struct zone *zone, unsigned long total_to_free,
19967 +               struct scan_control sc)
19968 +{
19969 +       int prio;
19970 +       unsigned long freed = 0;
19971 +       if (!populated_zone(zone) || zone_is_all_unreclaimable(zone))
19972 +               return 0;
19973 +
19974 +       for (prio = DEF_PRIORITY; prio >= 0; prio--) {
19975 +               unsigned long to_free, just_freed, orig_size;
19976 +               unsigned long old_nr_active;
19977 +
19978 +               to_free = min(zone_page_state(zone, NR_ACTIVE) +
19979 +                               zone_page_state(zone, NR_INACTIVE),
19980 +                               total_to_free - freed);
19981 +
19982 +               if (to_free <= 0)
19983 +                       return freed;
19984 +
19985 +               sc.swap_cluster_max = to_free -
19986 +                       zone_page_state(zone, NR_INACTIVE);
19987 +
19988 +               do {
19989 +                       old_nr_active = zone_page_state(zone, NR_ACTIVE);
19990 +                       zone->nr_scan_active = sc.swap_cluster_max - 1;
19991 +                       shrink_active_list(sc.swap_cluster_max, zone, &sc,
19992 +                                       prio);
19993 +                       zone->nr_scan_active = 0;
19994 +
19995 +                       sc.swap_cluster_max = to_free - zone_page_state(zone,
19996 +                                       NR_INACTIVE);
19997 +
19998 +               } while (sc.swap_cluster_max > 0 &&
19999 +                        zone_page_state(zone, NR_ACTIVE) > old_nr_active);
20000 +
20001 +               to_free = min(zone_page_state(zone, NR_ACTIVE) +
20002 +                               zone_page_state(zone, NR_INACTIVE),
20003 +                               total_to_free - freed);
20004 +
20005 +               do {
20006 +                       orig_size = zone_page_state(zone, NR_ACTIVE) +
20007 +                               zone_page_state(zone, NR_INACTIVE);
20008 +                       zone->nr_scan_inactive = to_free;
20009 +                       sc.swap_cluster_max = to_free;
20010 +                       shrink_inactive_list(to_free, zone, &sc);
20011 +                       just_freed = (orig_size -
20012 +                               (zone_page_state(zone, NR_ACTIVE) +
20013 +                                zone_page_state(zone, NR_INACTIVE)));
20014 +                       zone->nr_scan_inactive = 0;
20015 +                       freed += just_freed;
20016 +               } while (just_freed > 0 && freed < total_to_free);
20017 +       }
20018 +
20019 +       return freed;
20020 +}
20021 +
20022 +void shrink_one_zone(struct zone *zone, unsigned long total_to_free,
20023 +               int ps_wanted)
20024 +{
20025 +       unsigned long freed = 0;
20026 +       struct scan_control sc = {
20027 +               .gfp_mask = GFP_KERNEL,
20028 +               .may_swap = 0,
20029 +               .may_writepage = 1,
20030 +               .swappiness = vm_swappiness,
20031 +               .isolate_pages = isolate_pages_global,
20032 +       };
20033 +
20034 +       if (total_to_free <= 0)
20035 +               return;
20036 +
20037 +       if (is_highmem(zone))
20038 +               sc.gfp_mask |= __GFP_HIGHMEM;
20039 +
20040 +       if (ps_wanted & 2)
20041 +               freed = shrink_ps2_zone(zone, total_to_free, sc);
20042 +       if (ps_wanted & 1)
20043 +               shrink_ps1_zone(zone, total_to_free - freed, sc);
20044 +}
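+
+/*
+ * Editorial example: ps_wanted is a bitmask - bit 1 (value 2) requests
+ * the LRU pass (shrink_ps2_zone), bit 0 (value 1) the slab pass
+ * (shrink_ps1_zone). So
+ *
+ *     shrink_one_zone(zone, nr_pages, 3);
+ *
+ * shrinks the LRU first and then makes up any shortfall from slab.
+ */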
20045 +
20046  /*
20047   * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
20048   * from LRU lists system-wide, for given pass and priority, and returns the