1 diff --git a/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt
2 new file mode 100644
3 index 0000000..68c0454
4 --- /dev/null
5 +++ b/Documentation/power/tuxonice-internals.txt
6 @@ -0,0 +1,474 @@
7 +                  TuxOnIce 3.0 Internal Documentation.
8 +                       Updated to 11 March 2008
9 +
10 +1.  Introduction.
11 +
12 +    TuxOnIce 3.0 is an addition to the Linux Kernel, designed to
13 +    allow the user to quickly shutdown and quickly boot a computer, without
14 +    needing to close documents or programs. It is equivalent to the
15 +    hibernate facility in some laptops. This implementation, however,
16 +    requires no special BIOS or hardware support.
17 +
18 +    The code in these files is based upon the original implementation
19 +    prepared by Gabor Kuti and additional work by Pavel Machek and a
20 +    host of others. This code has been substantially reworked by Nigel
21 +    Cunningham, again with the help and testing of many others, not the
22 +    least of whom is Michael Frank. At its heart, however, the operation is
23 +    essentially the same as Gabor's version.
24 +
25 +2.  Overview of operation.
26 +
27 +    The basic sequence of operations is as follows:
28 +
29 +       a. Quiesce all other activity.
30 +       b. Ensure enough memory and storage space are available, and attempt
31 +          to free memory/storage if necessary.
32 +       c. Allocate the required memory and storage space.
33 +       d. Write the image.
34 +       e. Power down.
35 +
36 +    There are a number of complicating factors which mean that things are
37 +    not as simple as the above would imply, however...
38 +
39 +    o The activity of each process must be stopped at a point where it will
40 +    not be holding locks necessary for saving the image, or unexpectedly
41 +    restart operations due to something like a timeout and thereby make
42 +    our image inconsistent.
43 +
44 +    o It is desirable that we sync outstanding I/O to disk before calculating
45 +    image statistics. This reduces corruption if one should suspend but
46 +    then not resume, and also makes later parts of the operation safer (see
47 +    below).
48 +
49 +    o We need to get as close as we can to an atomic copy of the data.
50 +    Inconsistencies in the image will result in inconsistent memory contents at
51 +    resume time, and thus in instability of the system and/or file system
52 +    corruption. This would appear to imply a maximum image size of one half of
53 +    the amount of RAM, but we have a solution... (again, below).
54 +
55 +    o In 2.6, we choose to play nicely with the other suspend-to-disk
56 +    implementations.
57 +
58 +3.  Detailed description of internals.
59 +
60 +    a. Quiescing activity.
61 +
62 +    Safely quiescing the system is achieved using three separate but related
63 +    aspects.
64 +
65 +    First, we note that the vast majority of processes don't need to run during
66 +    suspend. They can be 'frozen'. We therefore implement a refrigerator
67 +    routine, which processes enter and in which they remain until the cycle is
68 +    complete. Processes enter the refrigerator via try_to_freeze() invocations
69 +    at appropriate places.  A process cannot be frozen in any old place. It
70 +    must not be holding locks that will be needed for writing the image or
71 +    freezing other processes. For this reason, userspace processes generally
72 +    enter the refrigerator via the signal handling code, and kernel threads at
73 +    the place in their event loops where they drop locks and yield to other
74 +    processes or sleep.
75 +
76 +    The task of freezing processes is complicated by the fact that there can be
77 +    interdependencies between processes. Freezing process A before process B may
78 +    mean that process B cannot be frozen, because it blocks waiting for
79 +    process A rather than in the refrigerator. This issue is seen where
80 +    userspace waits on freezeable kernel threads or fuse filesystem threads. To
81 +    address this issue, we implement the following algorithm for quiescing
82 +    activity:
83 +
84 +       - Freeze filesystems (including fuse - userspace programs starting
85 +               new requests are immediately frozen; programs already running
86 +               requests complete their work before being frozen in the next
87 +               step)
88 +       - Freeze userspace
89 +       - Thaw filesystems (this is safe now that userspace is frozen and no
90 +               fuse requests are outstanding).
91 +       - Invoke sys_sync (noop on fuse).
92 +       - Freeze filesystems
93 +       - Freeze kernel threads
94 +
95 +    If we need to free memory, we thaw kernel threads and filesystems, but not
96 +    userspace. We can then free caches without worrying about deadlocks due to
97 +    swap files being on frozen filesystems or such like.
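+
+    In outline, the sequence above looks something like this (a sketch only -
+    the helper names here are illustrative, not the actual kernel symbols):
+
+    int quiesce_system(void)
+    {
+        freeze_filesystems();    /* new fuse requests now block */
+        freeze_userspace();      /* in-flight fuse requests have drained */
+        thaw_filesystems();      /* safe: userspace can submit nothing new */
+        sys_sync();              /* flush dirty data; a noop on fuse */
+        freeze_filesystems();
+        return freeze_kernel_threads();
+    }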
98 +
99 +    b. Ensure enough memory & storage are available.
100 +
101 +    We have a number of constraints to meet in order to be able to successfully
102 +    suspend and resume.
103 +
104 +    First, the image will be written in two parts, described below. One of these
105 +    parts needs to have an atomic copy made, which of course implies a maximum
106 +    size of one half of the amount of system memory. The other part ('pageset')
107 +    is not atomically copied, and can therefore be as large or small as desired.
108 +
109 +    Second, we have constraints on the amount of storage available. In these
110 +    calculations, we may also consider any compression that will be done. The
111 +    cryptoapi module allows the user to configure an expected compression ratio.
112 +   
113 +    Third, the user can specify an arbitrary limit on the image size, in
114 +    megabytes. This limit is treated as a soft limit, so that we don't fail the
115 +    attempt to suspend if we cannot meet this constraint.
116 +
117 +    c. Allocate the required memory and storage space.
118 +
119 +    Having done the initial freeze, we determine whether the above constraints
120 +    are met, and seek to allocate the metadata for the image. If the constraints
121 +    are not met, or we fail to allocate the required space for the metadata, we
122 +    seek to free the amount of memory that we calculate is needed and try again.
123 +    We allow up to four iterations of this loop before aborting the cycle. If we
124 +    do fail, it should only be because of a bug in TuxOnIce's calculations.
125 +    
126 +    These steps are merged together in the prepare_image function, found in
127 +    prepare_image.c. The functions are merged because of the cyclical nature
128 +    of the problem of calculating how much memory and storage is needed. Since
129 +    the data structures containing the information about the image must
130 +    themselves take memory and use storage, the amount of memory and storage
131 +    required changes as we prepare the image. Since the changes are not large,
132 +    only one or two iterations will be required to achieve a solution.
133 +
134 +    The recursive nature of the algorithm is minimised by keeping user space
135 +    frozen while preparing the image, and by the fact that our records of which
136 +    pages are to be saved, and of which pageset they are in, use bitmaps (so
137 +    that changes in the number or fragmentation of the pages to be saved don't
138 +    feed back via changes in the amount of memory needed for metadata). The
139 +    recursiveness is thus limited to any extra slab pages allocated to store the
140 +    extents that record storage used, and the effects of seeking to free memory.
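+
+    In outline (with illustrative names - the real code in prepare_image.c is
+    more involved), the retry loop behaves like this:
+
+    int prepare_image(void)
+    {
+        int attempt;
+
+        for (attempt = 0; attempt < 4; attempt++) {
+            if (constraints_met() && !allocate_metadata())
+                return 0;    /* ready to write the image */
+            free_memory(amount_needed());
+        }
+
+        return -ENOMEM;    /* should only happen given a calculation bug */
+    }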
141 +
142 +    d. Write the image.
143 +
144 +    We previously mentioned the need to create an atomic copy of the data, and
145 +    the half-of-memory limitation that is implied in this. This limitation is
146 +    circumvented by dividing the memory to be saved into two parts, called
147 +    pagesets.
148 +
149 +    Pageset2 contains the page cache - the pages on the active and inactive
150 +    lists. These pages aren't needed or modified while TuxOnIce is running, so
151 +    they can be safely written without an atomic copy. They are therefore
152 +    saved first and reloaded last. While saving these pages, TuxOnIce carefully
153 +    ensures that the work of writing the pages doesn't make the image
154 +    inconsistent.
155 +
156 +    Once pageset2 has been saved, we prepare to do the atomic copy of remaining
157 +    memory. As part of the preparation, we power down drivers, thereby providing
158 +    them with the opportunity to have their state recorded in the image. The
159 +    amount of memory allocated by drivers for this is usually negligible, but if
160 +    DRI is in use, video drivers may require significant amounts. Ideally we
161 +    would be able to query drivers while preparing the image as to the amount of
162 +    memory they will need. Unfortunately no such mechanism exists at the time of
163 +    writing. For this reason, TuxOnIce allows the user to set an
164 +    'extra_pages_allowance', which is used to seek to ensure sufficient memory
165 +    is available for drivers at this point. TuxOnIce also lets the user set this
166 +    value to 0. In this case, a test driver suspend is done while preparing the
167 +    image, and the difference (plus a margin) used instead.
168 +
169 +    Having suspended the drivers, we save the CPU context before making an
170 +    atomic copy of pageset1, resuming the drivers and saving the atomic copy.
171 +    After saving the two pagesets, we just need to save our metadata before
172 +    powering down.
173 +
174 +    As we mentioned earlier, the contents of pageset2 pages aren't needed once
175 +    they've been saved. We therefore use them as the destination of our atomic
176 +    copy. In the unlikely event that pageset1 is larger, extra pages are
177 +    allocated while the image is being prepared. This is normally only a real
178 +    possibility when the system has just been booted and the page cache is
179 +    small.
180 +
181 +    This is where we need to be careful about syncing, however. Pageset2 will
182 +    probably contain filesystem meta data. If this is overwritten with pageset1
183 +    and then a sync occurs, the filesystem will be corrupted - at least until
184 +    resume time and another sync of the restored data. Since there is a
185 +    possibility that the user might not resume or (may it never be!) that
186 +    suspend might oops, we do our utmost to avoid syncing filesystems after
187 +    copying pageset1.
188 +
189 +    e. Power down.
190 +
191 +    Powering down uses standard kernel routines. TuxOnIce supports powering down
192 +    using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off.
193 +    Supporting suspend to ram (S3) as a power off option might sound strange,
194 +    but it allows the user to quickly get their system up and running again if
195 +    the battery doesn't run out (we just need to re-read the overwritten pages)
196 +    and if the battery does run out (or the user removes power), they can still
197 +    resume.
198 +
199 +4.  Data Structures.
200 +
201 +    TuxOnIce uses three main structures to store its metadata and configuration
202 +    information:
203 +
204 +    a) Pageflags bitmaps.
205 +
206 +    TuxOnIce records which pages will be in pageset1, pageset2, the destination
207 +    of the atomic copy and the source of the atomically restored image using
208 +    bitmaps. These bitmaps are created from order zero allocations to maximise
209 +    reliability. The individual pages are combined together with pointers to
210 +    form per-zone bitmaps, which are in turn combined with another layer of
211 +    pointers to construct the overall bitmap.
212 +
213 +    The pageset1 bitmap is thus easily stored in the image header for use at
214 +    resume time.
215 +
216 +    As mentioned above, using bitmaps also means that the amount of memory and
217 +    storage required for recording the above information is constant. This
218 +    greatly simplifies the work of preparing the image. In earlier versions of
219 +    TuxOnIce, extents were used to record which pages would be stored. In that
220 +    case, however, eating memory could result in greater fragmentation of the
221 +    lists of pages, which in turn required more memory to store the extents and
222 +    more storage in the image header. These could in turn require further
223 +    freeing of memory, and another iteration. All of this complexity is removed
224 +    by having bitmaps.
225 +
226 +    Bitmaps also make a lot of sense because TuxOnIce only ever iterates
227 +    through the lists. There is therefore no cost to not being able to find the
228 +    nth page in order 0 time. We only need to worry about the cost of finding
229 +    the n+1th page, given the location of the nth page. Bitwise optimisations
230 +    help here.
231 +
232 +    The data structure is: unsigned long ***.
233 +
234 +    b) Extents for block data.
235 +
236 +    TuxOnIce supports writing the image to multiple block devices. In the case
237 +    of swap, multiple partitions and/or files may be in use, and we happily use
238 +    them all. This is accomplished as follows:
239 +
240 +    Whatever the actual source of the allocated storage, the destination of the
241 +    image can be viewed in terms of one or more block devices, and on each
242 +    device, a list of sectors. To simplify matters, we only use contiguous,
243 +    PAGE_SIZE aligned sectors, like the swap code does.
244 +
245 +    Since sector numbers on each bdev may well not start at 0, it makes much
246 +    more sense to use extents here. Contiguous ranges of pages can thus be
247 +    represented in the extents by contiguous values.
248 +
249 +    Variations in block size are taken account of in transforming this data
250 +    into the parameters for bio submission.
251 +
252 +    We can thus implement a layer of abstraction wherein the core of TuxOnIce
253 +    doesn't have to worry about which device we're currently writing to or
254 +    where in the device we are. It simply requests that the next page in the
255 +    pageset or header be written, leaving the details to this lower layer.
256 +    The lower layer remembers where in the sequence of devices and blocks each
257 +    pageset starts. The header always starts at the beginning of the allocated
258 +    storage.
259 +
260 +    So extents are:
261 +
262 +    struct extent {
263 +      unsigned long minimum, maximum;
264 +      struct extent *next;
265 +    };
266 +
267 +    These are combined into chains of extents for a device:
268 +
269 +    struct extent_chain {
270 +      int size; /* size of the chain, i.e. sum of (max - min + 1) */
271 +      int allocs, frees;
272 +      char *name;
273 +      struct extent *first, *last_touched;
274 +    };
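+
+    Appending a page to a chain might then look like the following sketch
+    (illustrative, not the exact TuxOnIce routine):
+
+    int append_to_chain(struct extent_chain *chain, unsigned long value)
+    {
+        struct extent *last = chain->last_touched;
+
+        if (last && value == last->maximum + 1) {
+            last->maximum = value;    /* contiguous: extend the run */
+        } else {
+            struct extent *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+            if (!new)
+                return -ENOMEM;
+            new->minimum = new->maximum = value;
+            new->next = NULL;
+            if (last)
+                last->next = new;
+            else
+                chain->first = new;
+            chain->last_touched = new;
+            chain->allocs++;
+        }
+        chain->size++;
+        return 0;
+    }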
275 +
276 +    For each bdev, we need to store a little more info:
277 +
278 +    struct suspend_bdev_info {
279 +       struct block_device *bdev;
280 +       dev_t dev_t;
281 +       int bmap_shift;
282 +       int blocks_per_page;
283 +    };
284 +
285 +    The dev_t is used to identify the device in the stored image. As a result,
286 +    we expect devices at resume time to have the same major and minor numbers
287 +    as they had while suspending.  This is primarily a concern where the user
288 +    utilises LVM for storage, as they will need to dmsetup their partitions in
289 +    such a way as to maintain this consistency at resume time.
290 +
291 +    bmap_shift and blocks_per_page record the effects of variations in
292 +    blocks-per-page settings for the filesystem and underlying bdev. For most
293 +    filesystems, these are the same, but for xfs, they can have independent
294 +    values.
295 +
296 +    Combining these two structures together, we have everything we need to
297 +    record what devices and what blocks on each device are being used to
298 +    store the image, and to submit i/o using bio_submit.
299 +
300 +    The last elements in the picture are a means of recording how the storage
301 +    is being used.
302 +
303 +    We do this first and foremost by implementing a layer of abstraction on
304 +    top of the devices and extent chains which allows us to view however many
305 +    devices there might be as one long storage tape, with a single 'head' that
306 +    tracks a 'current position' on the tape:
307 +
308 +    struct extent_iterate_state {
309 +      struct extent_chain *chains;
310 +      int num_chains;
311 +      int current_chain;
312 +      struct extent *current_extent;
313 +      unsigned long current_offset;
314 +    };
315 +
316 +    That is, *chains points to an array of size num_chains of extent chains.
317 +    For the filewriter, this is always a single chain. For the swapwriter, the
318 +    array is of size MAX_SWAPFILES.
319 +
320 +    current_chain, current_extent and current_offset thus point to the current
321 +    index in the chains array (and into a matching array of struct
322 +    suspend_bdev_info), the current extent in that chain (to optimise access),
323 +    and the current value in the offset.
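+
+    Advancing the head by one page is then straightforward (a sketch; error
+    handling is simplified):
+
+    int tape_next_page(struct extent_iterate_state *state)
+    {
+        if (++state->current_offset <= state->current_extent->maximum)
+            return 0;
+
+        state->current_extent = state->current_extent->next;
+        if (!state->current_extent) {
+            /* Fell off the end of this chain: move to the next device. */
+            if (++state->current_chain == state->num_chains)
+                return -ENOSPC;    /* out of allocated storage */
+            state->current_extent = state->chains[state->current_chain].first;
+        }
+        state->current_offset = state->current_extent->minimum;
+        return 0;
+    }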
324 +
325 +    The image is divided into three parts:
326 +    - The header
327 +    - Pageset 1
328 +    - Pageset 2
329 +
330 +    The header always starts at the first device and first block. We know its
331 +    size before we begin to save the image because we carefully account for
332 +    everything that will be stored in it.
333 +
334 +    The second pageset (LRU) is stored first. It begins on the next page after
335 +    the end of the header.
336 +
337 +    The first pageset is stored second. Its start location is only known once
338 +    pageset2 has been saved, since pageset2 may be compressed as it is written.
339 +    This location is thus recorded at the end of saving pageset2. It is also
340 +    page aligned.
341 +
342 +    Since this information is needed at resume time, and the location of extents
343 +    in memory will differ at resume time, this needs to be stored in a portable
344 +    way:
345 +
346 +    struct extent_iterate_saved_state {
347 +        int chain_num;
348 +        int extent_num;
349 +        unsigned long offset;
350 +    };
351 +
352 +    We can thus implement a layer of abstraction wherein the core of TuxOnIce
353 +    doesn't have to worry about which device we're currently writing to or
354 +    where in the device we are. It simply requests that the next page in the
355 +    pageset or header be written, leaving the details to this layer, and
356 +    invokes the routines to remember and restore the position, without having
357 +    to worry about the details of how the data is arranged on disk or such like.
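+
+    Remembering the position is then just a matter of flattening the current
+    pointers into indices (again a sketch):
+
+    void save_position(struct extent_iterate_state *state,
+                       struct extent_iterate_saved_state *saved)
+    {
+        struct extent *this = state->chains[state->current_chain].first;
+        int num = 0;
+
+        while (this != state->current_extent) {
+            this = this->next;
+            num++;
+        }
+
+        saved->chain_num = state->current_chain;
+        saved->extent_num = num;
+        saved->offset = state->current_offset;
+    }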
358 +
359 +    c) Modules
360 +
361 +    One aim in designing TuxOnIce was to make it flexible. We wanted to allow
362 +    for the implementation of different methods of transforming a page to be
363 +    written to disk and different methods of getting the pages stored.
364 +
365 +    In early versions (the betas and perhaps Suspend1), compression support was
366 +    inlined in the image writing code, and the data structures and code for
367 +    managing swap were intertwined with the rest of the code. A number of people
368 +    had expressed interest in implementing image encryption, and alternative
369 +    methods of storing the image.
370 +
371 +    In order to achieve this, TuxOnIce was given a modular design.
372 +
373 +    A module is a single file which encapsulates the functionality needed
374 +    to transform a pageset of data (encryption or compression, for example),
375 +    or to write the pageset to a device. The former type of module is called
376 +    a 'page-transformer', the latter a 'writer'.
377 +
378 +    Modules are linked together in pipeline fashion. There may be zero or more
379 +    page transformers in a pipeline, and there is always exactly one writer.
380 +    The pipeline follows this pattern:
381 +
382 +               ---------------------------------
383 +               |          TuxOnIce Core        |
384 +               ---------------------------------
385 +                               |
386 +                               |
387 +               ---------------------------------
388 +               |       Page transformer 1      |
389 +               ---------------------------------
390 +                               |
391 +                               |
392 +               ---------------------------------
393 +               |       Page transformer 2      |
394 +               ---------------------------------
395 +                               |
396 +                               |
397 +               ---------------------------------
398 +               |            Writer             |
399 +               ---------------------------------
400 +
401 +    During the writing of an image, the core code feeds pages one at a time
402 +    to the first module. This module performs whatever transformations it
403 +    implements on the incoming data, completely consuming the incoming data and
404 +    feeding output in a similar manner to the next module. A module may buffer
405 +    its output.
406 +
407 +    All routines are SMP safe, and the final result of the transformations is
408 +    written with an index (provided by the core) and size of the output by the
409 +    writer. As a result, we can have multithreaded I/O without needing to
410 +    worry about the sequence in which pages are written (or read).
411 +
412 +    During reading, the pipeline works in the reverse direction. The core code
413 +    calls the first module with the address of a buffer which should be filled.
414 +    (Note that the buffer size is always PAGE_SIZE at this time). This module
415 +    will in turn request data from the next module and so on down until the
416 +    writer is made to read from the stored image.
417 +
418 +    Part of the definition of the structure of a module thus looks like this:
419 +
420 +        int (*rw_init) (int rw, int stream_number);
421 +        int (*rw_cleanup) (int rw);
422 +        int (*write_chunk) (struct page *buffer_page);
423 +        int (*read_chunk) (struct page *buffer_page, int sync);
424 +
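+    A minimal 'null' page transformer - one which passes data through to the
+    next module in the pipeline unchanged - might implement the two chunk
+    hooks as follows (hypothetical names; a real module also fills in its
+    registration details):
+
+        static int null_write_chunk(struct page *buffer_page)
+        {
+            /* No transformation: hand the page to the next module. */
+            return next_module->write_chunk(buffer_page);
+        }
+
+        static int null_read_chunk(struct page *buffer_page, int sync)
+        {
+            /* Have the next module fill the caller's buffer directly. */
+            return next_module->read_chunk(buffer_page, sync);
+        }
+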
425 +    It should be noted that the _cleanup routine may be called before the
426 +    full stream of data has been read or written. While writing the image,
427 +    the user may (depending upon settings) choose to abort suspending, and
428 +    if we are in the midst of writing the last portion of the image, a portion
429 +    of the second pageset may be reread. This may also happen if an error
430 +    occurs and we seek to abort the process of writing the image.
431 +
432 +    The modular design is also useful in a number of other ways. It provides
433 +    a means whereby we can add support for:
434 +
435 +    - providing overall initialisation and cleanup routines;
436 +    - serialising configuration information in the image header;
437 +    - providing debugging information to the user;
438 +    - determining memory and image storage requirements;
439 +    - dis/enabling components at run-time;
440 +    - configuring the module (see below);
441 +
442 +    ...and routines for writers specific to their work:
443 +    - Parsing a resume= location;
444 +    - Determining whether an image exists;
445 +    - Marking a resume as having been attempted;
446 +    - Invalidating an image;
447 +
448 +    Since some parts of the core - the user interface and storage manager
449 +    support - have use for some of these functions, they are registered as
450 +    'miscellaneous' modules as well.
451 +
452 +    d) Sysfs data structures.
453 +
454 +    This brings us naturally to support for configuring TuxOnIce. We desired to
455 +    provide a way to make TuxOnIce as flexible and configurable as possible.
456 +    The user shouldn't have to reboot just because they now want to suspend to
457 +    a file instead of a partition, for example.
458 +
459 +    To accomplish this, TuxOnIce implements a very generic means whereby the
460 +    core and modules can register new sysfs entries. All TuxOnIce entries use
461 +    a single _store and _show routine, both of which are found in sysfs.c in
462 +    the kernel/power directory. These routines handle the most common operations
463 +    - getting and setting the values of bits, integers, longs, unsigned longs
464 +    and strings in one place, and allow overrides for customised get and set
465 +    options as well as side-effect routines for all reads and writes.
466 +
467 +    When combined with some simple macros, a new sysfs entry can then be defined
468 +    in just a couple of lines:
469 +
470 +    { TOI_ATTR("progress_granularity", SYSFS_RW),
471 +      SYSFS_INT(&progress_granularity, 1, 2048)
472 +    },
473 +
474 +    This defines a sysfs entry named "progress_granularity" which is rw and
475 +    allows the user to access an integer stored at &progress_granularity, giving
476 +    it a value between 1 and 2048 inclusive.
477 +
478 +    Sysfs entries are registered under /sys/power/tuxonice, and entries for
479 +    modules are located in a subdirectory named after the module.
480 +
481 diff --git a/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt
482 new file mode 100644
483 index 0000000..d13ce85
484 --- /dev/null
485 +++ b/Documentation/power/tuxonice.txt
486 @@ -0,0 +1,750 @@
487 +       --- TuxOnIce, version 3.0 ---
488 +
489 +1.  What is it?
490 +2.  Why would you want it?
491 +3.  What do you need to use it?
492 +4.  Why not just use the version already in the kernel?
493 +5.  How do you use it?
494 +6.  What do all those entries in /sys/power/tuxonice do?
495 +7.  How do you get support?
496 +8.  I think I've found a bug. What should I do?
497 +9.  When will XXX be supported?
498 +10. How does it work?
499 +11. Who wrote TuxOnIce?
500 +
501 +1. What is it?
502 +
503 +   Imagine you're sitting at your computer, working away. For some reason, you
504 +   need to turn off your computer for a while - perhaps it's time to go home
505 +   for the day. When you come back to your computer next, you're going to want
506 +   to carry on where you left off. Now imagine that you could push a button and
507 +   have your computer store the contents of its memory to disk and power down.
508 +   Then, when you next start up your computer, it loads that image back into
509 +   memory and you can carry on from where you were, just as if you'd never
510 +   turned the computer off. Starting up takes far less time, with no reopening
511 +   of applications or finding what directory you put that file in yesterday.
512 +   That's what TuxOnIce does.
513 +
514 +   TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who,
515 +   with some help from Pavel Machek, got an early version going in 1999. The
516 +   project was then taken over by Florent Chabaud while still in alpha version
517 +   numbers. Nigel Cunningham came on the scene when Florent was unable to
518 +   continue, moving the project into betas, then 1.0, 2.0 and so on up to
519 +   the present series. During the 2.0 series, the name was contracted to
520 +   Suspend2 and the website suspend2.net created. Beginning around July 2007,
521 +   a transition to calling the software TuxOnIce was made, to seek to help
522 +   make it clear that TuxOnIce is more concerned with hibernation than suspend
523 +   to ram.
524 +
525 +   Pavel Machek's swsusp code, which was merged around 2.5.17, retains the
526 +   original name, and was essentially a fork of the beta code until Rafael
527 +   Wysocki came on the scene in 2005 and began to improve it further.
528 +
529 +2. Why would you want it?
530 +
531 +   Why wouldn't you want it?
532 +   
533 +   Being able to save the state of your system and quickly restore it improves
534 +   your productivity - you get a useful system in far less time than through
535 +   the normal boot process. You also get to be completely 'green', using zero
536 +   power, or as close to that as possible (the computer may still provide
537 +   minimal power to some devices, so they can initiate a power on, but that
538 +   will be the same amount of power as would be used if you told the computer
539 +   to shut down).
540 +   
541 +3. What do you need to use it?
542 +
543 +   a. Kernel Support.
544 +
545 +   i) The TuxOnIce patch.
546 +   
547 +   TuxOnIce is part of the Linux Kernel. This version is not part of Linus's
548 +   2.6 tree at the moment, so you will need to download the kernel source and
549 +   apply the latest patch. Having done that, enable the appropriate options in
550 +   make [menu|x]config (under Power Management Options - look for "Enhanced
551 +   Hibernation"), compile and install your kernel. TuxOnIce works with SMP,
552 +   Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64.
553 +
554 +   TuxOnIce patches are available from http://tuxonice.net.
555 +
556 +   ii) Compression support.
557 +
558 +   Compression support is implemented via the cryptoapi. You will therefore want
559 +   to select any Cryptoapi transforms that you want to use on your image from
560 +   the Cryptoapi menu while configuring your kernel. Part of the TuxOnIce patch
561 +   adds a new cryptoapi compression algorithm called LZF. We recommend using
562 +   this compression method - it is very fast and still achieves good compression.
563 +
564 +   You can also tell TuxOnIce to write its image to an encrypted and/or
565 +   compressed filesystem/swap partition. In that case, you don't need to do
566 +   anything special for TuxOnIce when it comes to kernel configuration.
567 +
568 +   iii) Configuring other options.
569 +
570 +   While you're configuring your kernel, try to configure as much as possible
571 +   to build as modules. We recommend this because there are a number of drivers
572 +   that are still in the process of implementing proper power management
573 +   support. In those cases, the best way to work around their current lack is
574 +   to build them as modules and remove the modules while hibernating. You might
575 +   also bug the driver authors to get their support up to speed, or even help!
576 +
577 +   b. Storage.
578 +
579 +   i) Swap.
580 +
581 +   TuxOnIce can store the hibernation image in your swap partition, a swap file or
582 +   a combination thereof. Whichever combination you choose, you will probably
583 +   want to create enough swap space to store the largest image you could have,
584 +   plus the space you'd normally use for swap. A good rule of thumb would be
585 +   to calculate the amount of swap you'd want without using TuxOnIce, and then
586 +   add the amount of memory you have. This swapspace can be arranged in any way
587 +   you'd like. It can be in one partition or file, or spread over a number. The
588 +   only requirement is that they be active when you start a hibernation cycle.
589 +   
590 +   There is one exception to this requirement. TuxOnIce has the ability to turn
591 +   on one swap file or partition at the start of hibernating and turn it back off
592 +   at the end. If you want to ensure you have enough memory to store an image
593 +   when your memory is fully used, you might want to make one swap partition or
594 +   file for 'normal' use, and another for TuxOnIce to activate & deactivate
595 +   automatically. (Further details below).
596 +
597 +   ii) Normal files.
598 +
599 +   TuxOnIce includes a 'file allocator'. The file allocator can store your
600 +   image in a simple file. Since Linux has the concept of everything being a
601 +   file, this is more powerful than it initially sounds. If, for example, you
602 +   were to set up a network block device file, you could hibernate to a network
603 +   server. This has been tested and works to a point, but nbd itself isn't
604 +   stateless enough for our purposes.
605 +
606 +   Take extra care when setting up the file allocator. If you just type
607 +   commands without thinking and then try to hibernate, you could cause
608 +   irreversible corruption on your filesystems! Make sure you have backups.
609 +
610 +   Most people will only want to hibernate to a local file. To achieve that, do
611 +   something along the lines of:
612 +
613 +   echo "TuxOnIce" > /hibernation-file
614 +   dd if=/dev/zero bs=1M count=512 >> /hibernation-file
615 +
616 +   This will create a 512MB file called /hibernation-file. To get TuxOnIce to use
617 +   it:
618 +
619 +   echo /hibernation-file > /sys/power/tuxonice/file/target
620 +
621 +   Then
622 +
623 +   cat /sys/power/tuxonice/resume
624 +
625 +   Put the results of this into your bootloader's configuration (see also step
626 +   C, below):
627 +
628 +   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
629 +   # cat /sys/power/tuxonice/resume
630 +   file:/dev/hda2:0x1e001
631 +   
632 +   In this example, we would edit the append= line of our lilo.conf|menu.lst
633 +   so that it included:
634 +
635 +   resume=file:/dev/hda2:0x1e001
636 +   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
637 +
638 +   For those who are thinking 'Could I make the file sparse?', the answer is
639 +   'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in
640 +   a sparse file while hibernating. In the longer term (post merge!), I'd like
641 +   to change things so that the file could be dynamically resized and have
642 +   holes filled as needed. Right now, however, that's not possible and not a
643 +   priority.
644 +
645 +   c. Bootloader configuration.
646 +   
647 +   Using TuxOnIce also requires that you add an extra parameter to 
648 +   your lilo.conf or equivalent. Here's an example for a swap partition:
649 +
650 +   append="resume=swap:/dev/hda1"
651 +
652 +   This would tell TuxOnIce that /dev/hda1 is a swap partition you 
653 +   have. TuxOnIce will use the swap signature of this partition as a
654 +   pointer to your data when you hibernate. This means that (in this example)
655 +   /dev/hda1 doesn't need to be _the_ swap partition where all of your data
656 +   is actually stored. It just needs to be a swap partition that has a
657 +   valid signature.
658 +
659 +   You don't need to have a swap partition for this purpose. TuxOnIce
660 +   can also use a swap file, but usage is a little more complex. Having made
661 +   your swap file, turn it on and do 
662 +
663 +   cat /sys/power/tuxonice/swap/headerlocations
664 +
665 +   (this assumes you've already compiled your kernel with TuxOnIce
666 +   support and booted it). The results of the cat command will tell you
667 +   what you need to put in lilo.conf:
668 +
669 +   For a swap partition such as /dev/hda1, simply use resume=swap:/dev/hda1.
670 +   For a swap file, it will be something like resume=swap:/dev/hda2:0x242d.
671 +
672 +   If the swapfile changes for any reason (it is moved to a different
673 +   location, it is deleted and recreated, or the filesystem is
674 +   defragmented) then you will have to check
675 +   /sys/power/tuxonice/swap/headerlocations for a new resume= value.
676 +
677 +   Once you've compiled and installed the kernel and adjusted your bootloader
678 +   configuration, you should only need to reboot for the most basic part
679 +   of TuxOnIce to be ready.
680 +
681 +   If you only compile in the swap allocator, or only compile in the file
682 +   allocator, you don't need to add the "swap:" part of the resume=
683 +   parameters above. resume=/dev/hda2:0x242d will work just as well. If you
684 +   have compiled both and your storage is on swap, you can also use this
685 +   format (the swap allocator is the default allocator).
686 +
687 +   When compiling your kernel, one of the options in the 'Power Management
688 +   Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is
689 +   called 'Default resume partition'. This can be used to set a default value
690 +   for the resume= parameter.
691 +
692 +   d. The hibernate script.
693 +
694 +   Since the driver model in 2.6 kernels is still being developed, you may need
695 +   to do more than just configure TuxOnIce. Users of TuxOnIce usually start the
696 +   process via a script which prepares for the hibernation cycle, tells the
697 +   kernel to do its stuff and then restores things afterwards. This script might
698 +   involve:
699 +
700 +   - Switching to a text console and back if X doesn't like the video card
701 +     status on resume.
702 +   - Un/reloading drivers that don't play well with hibernation.
703 +  
704 +   Note that you might not be able to unload some drivers if there are 
705 +   processes using them. You might have to kill off processes that hold
706 +   devices open. Hint: if your X server accesses a USB mouse, doing a
707 +   'chvt' to a text console releases the device and you can unload the
708 +   module.
709 +
710 +   Check out the latest script (available on tuxonice.net).
711 +
712 +   e. The userspace user interface.
713 +
714 +   TuxOnIce has very limited support for displaying status if you only apply
715 +   the kernel patch - it can printk messages, but that is all. In addition,
716 +   some of the functions mentioned in this document (such as cancelling a cycle
717 +   or performing interactive debugging) are unavailable. To utilise these
718 +   functions, or simply get a nice display, you need the 'userui' component.
719 +   Userui comes in three flavours: usplash, fbsplash and text. Text should
720 +   work on any console. Usplash and fbsplash require the appropriate
721 +   (distro specific?) support.
722 +
723 +   To utilise a userui, TuxOnIce just needs to be told where to find the
724 +   userspace binary:
725 +
726 +   echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program
727 +
728 +   The hibernate script can do this for you, and a default value for this
729 +   setting can be configured when compiling the kernel. This path is also
730 +   stored in the image header, so if you have an initrd or initramfs, you can
731 +   use the userui during the first part of resuming (prior to the atomic
732 +   restore) by putting the binary in the same path in your initrd/ramfs.
733 +   Alternatively, you can put it in a different location and do an echo
734 +   similar to the above prior to the echo > do_resume. The value saved in the
735 +   image header will then be ignored.
736 +
737 +4. Why not just use the version already in the kernel?
738 +
739 +   The version in the vanilla kernel has a number of drawbacks. The most
740 +   serious of these are:
741 +       - it has a maximum image size of 1/2 total memory;
742 +       - it doesn't allocate storage until after it has snapshotted memory.
743 +         This means that you can't be sure hibernating will work until you
744 +         see it start to write the image;
745 +       - it does not allow you to press escape to cancel a cycle;
746 +       - it does not allow you to press escape to cancel resuming;
747 +       - it does not allow you to automatically swapon a file when
748 +         starting a cycle;
749 +       - it does not allow you to use multiple swap partitions or files;
750 +       - it does not allow you to use ordinary files;
751 +       - it just invalidates an image and continues to boot if you
752 +         accidentally boot the wrong kernel after hibernating;
753 +       - it doesn't support any sort of nice display while hibernating;
754 +       - it is moving toward requiring that you have an initrd/initramfs
755 +         to ever have a hope of resuming (uswsusp). While uswsusp will
756 +         address some of the concerns above, it won't address all of them,
757 +          and will be more complicated to get set up;
758 +        - it doesn't have support for suspend-to-both (write a hibernation
759 +         image, then suspend to ram; I think this is known as ReadySafe
760 +         under M$).
761 +
762 +5. How do you use it?
763 +
764 +   A hibernation cycle can be started directly by doing:
765 +
766 +       echo > /sys/power/tuxonice/do_hibernate
767 +
768 +   In practice, though, you'll probably want to use the hibernate script
769 +   to unload modules, configure the kernel the way you like it and so on.
770 +   In that case, you'd do (as root):
771 +
772 +       hibernate
773 +
774 +   See the hibernate script's man page for more details on the options it
775 +   takes.
776 +
777 +   If you're using the text or splash user interface modules, one feature of
778 +   TuxOnIce that you might find useful is that you can press Escape at any time
779 +   during hibernating, and the process will be aborted.
780 +
781 +   Due to the way hibernation works, this means you'll have your system back and
782 +   perfectly usable almost instantly. The only exception is when it's at the
783 +   very end of writing the image. Then it will need to reload a small (usually
784 +   4-50MB, depending upon the image characteristics) portion first.
785 +
786 +   Likewise, when resuming, you can press escape and resuming will be aborted.
787 +   The computer will then power down again, according to the settings in force
788 +   at that time for the powerdown method or rebooting.
789 +
790 +   You can change the settings for powering down while the image is being
791 +   written by pressing 'R' to toggle rebooting and 'O' to toggle between
792 +   suspending to ram and powering down completely.
793 +   
794 +   If you run into problems with resuming, adding the "noresume" option to
795 +   the kernel command line will let you skip the resume step and recover your
796 +   system. This option shouldn't normally be needed, because TuxOnIce modifies
797 +   the image header prior to the atomic restore, and will thus prompt you
798 +   if it detects that you've tried to resume an image before (this flag is
799 +   removed if you press Escape to cancel a resume, so you won't be prompted
800 +   then).
801 +
802 +   Recent kernels (2.6.24 onwards) add support for resuming from a different
803 +   kernel to the one that was hibernated (thanks to Rafael for his work on
804 +   this - I've just embraced and enhanced the support for TuxOnIce). This
805 +   should further reduce the need for you to use the noresume option.
806 +
807 +6. What do all those entries in /sys/power/tuxonice do?
808 +
809 +   /sys/power/tuxonice is the directory which contains files you can use to
810 +   tune and configure TuxOnIce to your liking. The exact contents of
811 +   the directory will depend upon the version of TuxOnIce you're
812 +   running and the options you selected at compile time. In the following
813 +   descriptions, names in brackets refer to compile time options.
814 +   (Note that they're all dependent upon you having selected CONFIG_TUXONICE
815 +   in the first place!).
816 +
817 +   Since the values of these settings can open potential security risks, the
818 +   writeable ones are accessible only to the root user. You may want to
819 +   configure sudo to allow you to invoke your hibernate script as an ordinary
820 +   user.
821 +
822 +   - checksum/enabled
823 +
824 +   Use cryptoapi hashing routines to verify that Pageset2 pages don't change
825 +   while we're saving the first part of the image, and to get any pages that
826 +   do change resaved in the atomic copy. This should normally not be needed,
827 +   but if you're seeing issues, please enable this. If your issues stop you
828 +   being able to resume, enable this option, hibernate and cancel the cycle
829 +   after the atomic copy is done. If the debugging info shows a non-zero
830 +   number of pages resaved, please report this to Nigel.
831 +
832 +   - compression/algorithm
833 +
834 +   Set the cryptoapi algorithm used for compressing the image.
835 +
836 +   - compression/expected_compression
837 +
838 +   This value allows you to set an expected compression ratio, which TuxOnIce
839 +   will use in calculating whether it meets constraints on the image size. If
840 +   this expected compression ratio is not attained, the hibernation cycle will
841 +   abort, so it is wise to allow some spare. You can see what compression
842 +   ratio is achieved in the logs after hibernating.
843 +
844 +   - debug_info:
845 +  
846 +   This file returns information about your configuration that may be helpful
847 +   in diagnosing problems with hibernating.
848 +
849 +   - do_hibernate:
850 +
851 +   When anything is written to this file, the kernel side of TuxOnIce will
852 +   begin to attempt to write an image to disk and power down. You'll normally
853 +   want to run the hibernate script instead, to get modules unloaded first.
854 +
855 +   - do_resume:
856 +
857 +   When anything is written to this file TuxOnIce will attempt to read and
858 +   restore an image. If there is no image, it will return almost immediately.
859 +   If an image exists, the echo > will never return. Instead, the original
860 +   kernel context will be restored and the original echo > do_hibernate will
861 +   return.
862 +
863 +   - */enabled
864 +
865 +   These options can be used to temporarily disable various parts of TuxOnIce.
866 +
867 +   - extra_pages_allowance
868 +
869 +   When TuxOnIce does its atomic copy, it calls the driver model suspend
870 +   and resume methods. If you have DRI enabled with a driver such as fglrx,
871 +   this can result in the driver allocating a substantial amount of memory
872 +   for storing its state. Extra_pages_allowance tells TuxOnIce how much
873 +   extra memory it should ensure is available for those allocations. If
874 +   your attempts at hibernating end with a message in dmesg indicating that
875 +   insufficient extra pages were allowed, you need to increase this value.
876 +
877 +   - file/target:
878 +
879 +   Read this value to get the current setting. Write to it to point TuxOnIce
880 +   at a new storage location for the file allocator. See section 3.b.ii above
881 +   for details of how to set up the file allocator.
882 +
883 +   - freezer_test
884 +
885 +   This entry can be used to get TuxOnIce to just test the freezer and prepare
886 +   an image without actually doing a hibernation cycle. It is useful for
887 +   diagnosing freezing and image preparation issues.
888 +
889 +   - image_exists:
890 +
891 +   Can be used in a script to determine whether a valid image exists at the
892 +   location currently pointed to by resume=. Returns up to three lines.
893 +   The first is whether an image exists (-1 for unsure, otherwise 0 or 1).
894 +   If an image exists, additional lines will return the machine and version.
895 +   Echoing anything to this entry removes any current image.
896 +
897 +   - image_size_limit:
898 +
899 +   The maximum size of hibernation image written to disk, measured in megabytes
900 +   (1024*1024 bytes).
901 +
902 +   - last_result:
903 +
904 +   The result of the last hibernation cycle, as defined in
905 +   include/linux/suspend-debug.h with the values SUSPEND_ABORTED to
906 +   SUSPEND_KEPT_IMAGE. This is a bitmask.
907 +
908 +   - log_everything (CONFIG_PM_DEBUG):
909 +
910 +   Setting this option results in all messages printed being logged. Normally,
911 +   only a subset are logged, so as to not slow the process and not clutter the
912 +   logs. Useful for debugging. It can be toggled during a cycle by pressing
913 +   'L'.
914 +
915 +   - pause_between_steps (CONFIG_PM_DEBUG):
916 +
917 +   This option is used during debugging, to make TuxOnIce pause between
918 +   each step of the process. It is ignored when the nice display is on.
919 +
920 +   - powerdown_method:
921 +
922 +   Used to select a method by which TuxOnIce should powerdown after writing the
923 +   image. Currently:
924 +
925 +   0: Don't use ACPI to power off.
926 +   3: Attempt to enter Suspend-to-ram.
927 +   4: Attempt to enter ACPI S4 mode.
928 +   5: Attempt to power down via ACPI S5 mode.
929 +
930 +   Note that these options are highly dependent upon your hardware & software:
931 +
932 +   3: When successful, your machine suspends to ram instead of powering off.
933 +      The advantage of using this mode is that it doesn't matter whether your
934 +      battery has enough charge to make it through to your next resume. If it
935 +      lasts, you will simply resume from suspend to ram (and the image on disk
936 +      will be discarded). If the battery runs out, you will resume from disk
937 +      instead. The disadvantage is that it takes longer than a normal
938 +      suspend-to-ram to enter the state, since the suspend-to-disk image needs
939 +      to be written first.
940 +   4/5: When successful, your machine will be off and consume (almost) no power.
941 +      But it might still react to some external events like opening the lid or
942 +      traffic on a network or usb device. For the BIOS, resume is then the same
943 +      as a warm boot, similar to a situation where you used the command `reboot'
944 +      to reboot your machine. If your machine has problems on warm boot or if
945 +      you want to protect your machine with the bios password, this is probably
946 +      not the right choice. Mode 4 may be necessary on some machines where ACPI
947 +      wake up methods need to be run to properly reinitialise hardware after a
948 +      hibernation cycle.  
949 +   0: Switch the machine completely off. The only possible wakeup is the power
950 +      button. For the BIOS, resume is then the same as a cold boot; in
951 +      particular, you would have to provide your BIOS boot password if your
952 +      machine uses that feature for booting.
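+
+   For example, to ask TuxOnIce to try suspend-to-ram after writing the image,
+   you would do:
+
+   echo 3 > /sys/power/tuxonice/powerdown_method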
953 +
954 +   - progressbar_granularity_limit:
955 +
956 +   This option can be used to limit the granularity of the progress bar
957 +   displayed with a bootsplash screen. The value is the maximum number of
958 +   steps. That is, 10 will make the progress bar jump in 10% increments.
959 +
960 +   - reboot:
961 +
962 +   This option causes TuxOnIce to reboot rather than powering down
963 +   at the end of saving an image. It can be toggled during a cycle by pressing
964 +   'R'.
965 +
966 +   - resume_commandline:
967 +
968 +   This entry can be read after resuming to see the commandline that was used
969 +   when resuming began. You might use this to set up two bootloader entries
970 +   that are the same apart from the fact that one includes an extra append=
971 +   argument "at_work=1". You could then grep resume_commandline in your
972 +   post-resume scripts and configure networking (for example) differently
973 +   depending upon whether you're at home or work. resume_commandline can be
974 +   set to arbitrary text if you wish to remove sensitive contents.
975 +
976 +   - swap/swapfilename:
977 +
978 +   This entry is used to specify the swapfile or partition that
979 +   TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
980 +   I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
981 +   for my hibernation image, I would
982 +
983 +   echo /dev/hda2 > /sys/power/tuxonice/swap/swapfilename
984 +
985 +   /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
986 +   swapon and swapoff occur while other processes are frozen (including kswapd)
987 +   so this swap file will not be used up when attempting to free memory. The
988 +   partition/file is also given the highest priority, so other swapfiles/partitions
989 +   will only be used to save the image when this one is filled.
990 +
991 +   The value of this file is used by headerlocations along with any currently
992 +   activated swapfiles/partitions.
993 +
994 +   - swap/headerlocations:
995 +
996 +   This option tells you the resume= options to use for swap devices you
997 +   currently have activated. It is particularly useful when you only want to
998 +   use a swap file to store your image. See above for further details.
999 +
1000 +   - user_interface/program
1001 +
1002 +   This entry is used to tell TuxOnIce what userspace program to use for
1003 +   providing a user interface while hibernating. The program uses a netlink
1004 +   socket to pass messages back and forth to the kernel, providing all of the
1005 +   functions formerly implemented in the kernel user interface components.
1006 +
1007 +   - user_interface/debug_sections (CONFIG_PM_DEBUG):
1008 +
1009 +   This value, together with the console log level, controls what debugging
1010 +   information is displayed. The console log level determines the level of
1011 +   detail, and this value determines what detail is displayed. This value is
1012 +   a bit vector, and the meaning of the bits can be found in the kernel tree
1013 +   in include/linux/tuxonice.h. It can be overridden using the kernel's
1014 +   command line option suspend_dbg.
1015 +
1016 +   - user_interface/default_console_level (CONFIG_PM_DEBUG):
1017 +
1018 +   This determines the value of the console log level at the start of a
1019 +   hibernation cycle. If debugging is compiled in, the console log level can be
1020 +   changed during a cycle by pressing the digit keys. Meanings are:
1021 +
1022 +   0: Nice display.
1023 +   1: Nice display plus numerical progress.
1024 +   2: Errors only.
1025 +   3: Low level debugging info.
1026 +   4: Medium level debugging info.
1027 +   5: High level debugging info.
1028 +   6: Verbose debugging info.
1029 +
1030 +   - user_interface/enable_escape:
1031 +
1032 +   Setting this to "1" will enable you to abort a hibernation cycle or a
1033 +   resume by pressing escape; "0" (default) disables this feature. Note
1034 +   that enabling this option means that you cannot initiate a hibernation
1035 +   cycle and then walk away from your computer, expecting it to be secure.
1036 +   With this feature disabled, you can validly have this expectation once
1037 +   TuxOnIce begins to write the image to disk. (Prior to this point, it is
1038 +   possible that TuxOnIce might abort because of a failure to freeze all
1039 +   processes or because constraints on its ability to save the image are
1040 +   not met.)
1041 +
1042 +   - version:
1043 +  
1044 +   The version of TuxOnIce you have compiled into the currently running kernel.
1045 +
1046 +7. How do you get support?
1047 +
1048 +   Glad you asked. TuxOnIce is being actively maintained and supported
1049 +   by Nigel (the guy doing most of the kernel coding at the moment), Bernard
1050 +   (who maintains the hibernate script and userspace user interface components)
1051 +   and its users.
1052 +
1053 +   Resources available include HowTos, FAQs and a Wiki, all accessible via
1054 +   tuxonice.net.  You can find the mailing lists there.
1055 +
1056 +8. I think I've found a bug. What should I do?
1057 +
1058 +   By far and away, the most common problems people have with TuxOnIce
1059 +   relate to drivers not having adequate power management support. In this
1060 +   case, it is not a bug with TuxOnIce, but we can still help you. As we
1061 +   mentioned above, such issues can usually be worked around by building the
1062 +   functionality as modules and unloading them while hibernating. Please visit
1063 +   the Wiki for up-to-date lists of known issues and workarounds.
1064 +
1065 +   If this information doesn't help, try running:
1066 +
1067 +   hibernate --bug-report
1068 +
1069 +   ...and sending the output to the users mailing list.
1070 +
1071 +   Good guidance on how to provide us with useful information from an
1072 +   oops is found in the file REPORTING-BUGS, in the top level directory
1073 +   of the kernel tree. If you get an oops, please especially note the
1074 +   information about running what is printed on the screen through ksymoops.
1075 +   The raw information is useless.
1076 +
1077 +9. When will XXX be supported?
1078 +
1079 +   If there's a feature missing from TuxOnIce that you'd like, feel free to
1080 +   ask. We try to be obliging, within reason.
1081 +
1082 +   Patches are welcome. Please send to the list.
1083 +
1084 +10. How does it work?
1085 +
1086 +   TuxOnIce does its work in a number of steps.
1087 +
1088 +   a. Freezing system activity.
1089 +
1090 +   The first main stage in hibernating is to stop all other activity. This is
1091 +   achieved in stages. Processes are considered in four groups, which we will
1092 +   describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
1093 +   flag, kernel threads without this flag, userspace processes with the
1094 +   PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
1095 +   untouched by the refrigerator code. They are allowed to run during hibernating
1096 +   and resuming, and are used to support user interaction, storage access or the
1097 +   like. Other kernel threads (those unneeded while hibernating) are frozen last.
1098 +   This leaves us with userspace processes that need to be frozen. When a
1099 +   process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
1100 +   that process for the duration of that call. Processes that have this flag are
1101 +   frozen after processes without it, so that we can seek to ensure that dirty
1102 +   data is synced to disk as quickly as possible in a situation where other
1103 +   processes may be submitting writes at the same time. Freezing the processes
1104 +   that are submitting data stops new I/O from being submitted. Syncthreads can
1105 +   then cleanly finish their work. So the order is:
1106 +
1107 +   - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
1108 +   - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
1109 +   - Kernel processes without PF_NOFREEZE.
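+
+   As a rough sketch only (the helper names here are hypothetical, not the
+   actual freezer code), the ordering amounts to:
+
+      /* Freeze in the order described above. */
+      freeze_tasks(USERSPACE_ORDINARY); /* no PF_SYNCTHREAD/PF_NOFREEZE */
+      freeze_tasks(USERSPACE_SYNC);     /* PF_SYNCTHREAD holders */
+      freeze_tasks(KERNEL_THREADS);     /* all but PF_NOFREEZE */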
1110 +
1111 +   b. Eating memory.
1112 +
1113 +   For a successful hibernation cycle, you need enough disk space to store
1114 +   the image and enough free memory to satisfy the various constraints of
1115 +   TuxOnIce's algorithm. You can also specify a maximum image size. In order
1116 +   to meet those constraints, TuxOnIce may 'eat' memory. If, after freezing
1117 +   processes, the constraints aren't met, TuxOnIce will thaw all the
1118 +   other processes and begin to eat memory until its calculations indicate
1119 +   the constraints are met. It will then freeze processes again and recheck
1120 +   its calculations.
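+
+   In outline (a simplified sketch; constraints_met() and eat_memory() are
+   hypothetical names for the checks and reclaim described above):
+
+      for (;;) {
+              freeze_processes();
+              if (constraints_met())  /* enough RAM and storage? */
+                      break;
+              thaw_processes();
+              eat_memory();           /* reclaim until targets look met */
+      }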
1121 +
1122 +   c. Allocation of storage.
1123 +
1124 +   Next, TuxOnIce allocates the storage that will be used to save
1125 +   the image.
1126 +
1127 +   The core of TuxOnIce knows nothing about how or where pages are stored. We
1128 +   therefore request the active allocator (remember you might have compiled in
1129 +   more than one!) to allocate enough storage for our expected image size. If
1130 +   this request cannot be fulfilled, we eat more memory and try again. If it
1131 +   is fulfilled, we seek to allocate additional storage, just in case our
1132 +   expected compression ratio (if any) isn't achieved. This time, however, we
1133 +   just continue if we can't allocate enough storage.
1134 +
1135 +   If these calls to our allocator change the characteristics of the image
1136 +   such that we no longer have enough storage allocated, we loop again. (The
1137 +   allocator may well need to allocate space for its own storage information.)
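+
+   Again as a sketch (with hypothetical helpers), the allocation stage is:
+
+      while (allocate_storage(expected_image_size()) != 0)
+              eat_memory();                      /* and loop as above */
+      allocate_storage(compression_headroom());  /* best effort only */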
1138 +
1139 +   d. Write the first part of the image.
1140 +
1141 +   TuxOnIce stores the image in two sets of pages called 'pagesets'.
1142 +   Pageset 2 contains pages on the active and inactive lists; essentially
1143 +   the page cache. Pageset 1 contains all other pages, including the kernel.
1144 +   We use two pagesets for one important reason: We need to make an atomic copy
1145 +   of the kernel to ensure consistency of the image. Without a second pageset,
1146 +   that would limit us to an image that was at most half the amount of memory
1147 +   available. Using two pagesets allows us to store a full image. Since pageset
1148 +   2 pages won't be needed in saving pageset 1, we first save pageset 2 pages.
1149 +   We can then make our atomic copy of the remaining pages using both pageset 2
1150 +   pages and any other pages that are free. While saving both pagesets, we are
1151 +   careful not to corrupt the image. Among other things, we use low-level block
1152 +   I/O routines that don't change the pagecache contents.
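+
+   For example, on a machine with 1GB of RAM whose image would fill it, a
+   single atomic copy would cap the image at roughly 512MB. If 700MB of that
+   RAM is page cache (pageset 2), TuxOnIce writes those pages first, then
+   reuses them as the destination for the atomic copy of the remaining
+   ~300MB (pageset 1), so the full 1GB can be saved.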
1153 +
1154 +   The next step, then, is writing pageset 2.
1155 +
1156 +   e. Suspending drivers and storing processor context.
1157 +
1158 +   Having written pageset 2, TuxOnIce calls the power management functions
1159 +   to notify drivers of the hibernation, and saves the processor state in
1160 +   preparation for the atomic copy of memory we are about to make.
1161 +
1162 +   f. Atomic copy.
1163 +
1164 +   At this stage, everything else but the TuxOnIce code is halted. Processes
1165 +   are frozen or idling, drivers are quiesced and (where necessary) have
1166 +   stored their configuration in the memory we are about to atomically copy.
1167 +   In our low-level, architecture-specific code, we have saved the CPU state.
1168 +   We can therefore now do our atomic copy before resuming drivers etc.
1169 +
1170 +   g. Save the atomic copy (pageset 1).
1171 +
1172 +   TuxOnIce can then write the atomic copy of the remaining pages. Since we
1173 +   have copied the pages into other locations, we can continue to use the
1174 +   normal block I/O routines without fear of corrupting our image.
1175 +
1176 +   h. Save the image header.
1177 +
1178 +   Nearly there! We save our settings and other parameters needed for
1179 +   reloading pageset 1 in an 'image header'. We also tell our allocator to
1180 +   serialise its data at this stage, so that it can reread the image at resume
1181 +   time.
1182 +
1183 +   i. Set the image header.
1184 +
1185 +   Finally, we edit the header at our resume= location. The signature is
1186 +   changed by the allocator to reflect the fact that an image exists, and to
1187 +   point to the start of that data if necessary (swap allocator).
1188 +
1189 +   j. Power down.
1190 +
1191 +   Or reboot if we're debugging and the appropriate option is selected.
1192 +
1193 +   Whew!
1194 +
1195 +   Reloading the image.
1196 +   --------------------
1197 +
1198 +   Reloading the image is essentially the reverse of all the above. We load
1199 +   our copy of pageset 1, being careful to choose locations that aren't going
1200 +   to be overwritten as we copy it back (We start very early in the boot
1201 +   process, so there are no other processes to quiesce here). We then copy
1202 +   pageset 1 back to its original location in memory and restore the process
1203 +   context. We are now running with the original kernel. Next, we reload the
1204 +   pageset 2 pages, free the memory and swap used by TuxOnIce, restore
1205 +   the pageset header and restart processes. Sounds easy in comparison to
1206 +   hibernating, doesn't it!
1207 +
1208 +   There is of course more to TuxOnIce than this, but this explanation
1209 +   should be a good start. If there's interest, I'll write further
1210 +   documentation on range pages and the low level I/O.
1211 +
1212 +11. Who wrote TuxOnIce?
1213 +
1214 +   (Answer based on the writings of Florent Chabaud, credits in files and
1215 +   Nigel's limited knowledge; apologies to anyone missed out!)
1216 +
1217 +   The main developers of TuxOnIce have been...
1218 +
1219 +   Gabor Kuti
1220 +   Pavel Machek
1221 +   Florent Chabaud
1222 +   Bernard Blackham
1223 +   Nigel Cunningham
1224 +
1225 +   Significant portions of swsusp, the code in the vanilla kernel which
1226 +   TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should
1227 +   also be expressed to him.
1228 +
1229 +   The above mentioned developers have been aided in their efforts by a host
1230 +   of hundreds, if not thousands of testers and people who have submitted bug
1231 +   fixes & suggestions. Of special note are the efforts of Michael Frank, who
1232 +   had his computers repeatedly hibernate and resume for literally tens of
1233 +   thousands of cycles and developed scripts to stress the system and test
1234 +   TuxOnIce far beyond the point most of us (Nigel included!) would consider
1235 +   testing. His efforts have contributed as much to TuxOnIce as any of the
1236 +   names above.
1237 diff --git a/MAINTAINERS b/MAINTAINERS
1238 index 3596d17..3e7e5ce 100644
1239 --- a/MAINTAINERS
1240 +++ b/MAINTAINERS
1241 @@ -4100,6 +4100,13 @@ P:       Maciej W. Rozycki
1242  M:     macro@linux-mips.org
1243  S:     Maintained
1244  
1245 +TUXONICE (ENHANCED HIBERNATION)
1246 +P:     Nigel Cunningham
1247 +M:     nigel@tuxonice.net
1248 +L:     suspend2-devel@tuxonice.net
1249 +W:     http://tuxonice.net
1250 +S:     Maintained
1251 +
1252  U14-34F SCSI DRIVER
1253  P:     Dario Ballabio
1254  M:     ballabio_dario@emc.com
1255 diff --git a/crypto/Kconfig b/crypto/Kconfig
1256 index d831859..59dbd07 100644
1257 --- a/crypto/Kconfig
1258 +++ b/crypto/Kconfig
1259 @@ -666,6 +666,14 @@ config CRYPTO_LZO
1260         help
1261           This is the LZO algorithm.
1262  
1263 +config CRYPTO_LZF
1264 +       tristate "LZF compression algorithm"
1265 +       default y
1266 +       select CRYPTO_ALGAPI
1267 +       help
1268 +         This is the LZF algorithm. It is especially useful for TuxOnIce,
1269 +         because it achieves good compression quickly.
1270 +
1271  source "drivers/crypto/Kconfig"
1272  
1273  endif  # if CRYPTO
1274 diff --git a/crypto/Makefile b/crypto/Makefile
1275 index d4f3ed8..fe05a9e 100644
1276 --- a/crypto/Makefile
1277 +++ b/crypto/Makefile
1278 @@ -67,6 +67,7 @@ obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
1279  obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
1280  obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
1281  obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
1282 +obj-$(CONFIG_CRYPTO_LZF) += lzf.o
1283  obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o
1284  obj-$(CONFIG_CRYPTO_LZO) += lzo.o
1285  
1286 diff --git a/crypto/lzf.c b/crypto/lzf.c
1287 new file mode 100644
1288 index 0000000..3e0aa8c
1289 --- /dev/null
1290 +++ b/crypto/lzf.c
1291 @@ -0,0 +1,326 @@
1292 +/*
1293 + * Cryptoapi LZF compression module.
1294 + *
1295 + * Copyright (c) 2004-2005 Nigel Cunningham <nigel at tuxonice net>
1296 + *
1297 + * based on the deflate.c file:
1298 + *
1299 + * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
1300 + *
1301 + * and upon the LZF compression module donated to the TuxOnIce project with
1302 + * the following copyright:
1303 + *
1304 + * This program is free software; you can redistribute it and/or modify it
1305 + * under the terms of the GNU General Public License as published by the Free
1306 + * Software Foundation; either version 2 of the License, or (at your option)
1307 + * any later version.
1308 + * Copyright (c) 2000-2003 Marc Alexander Lehmann <pcg@goof.com>
1309 + *
1310 + * Redistribution and use in source and binary forms, with or without modifica-
1311 + * tion, are permitted provided that the following conditions are met:
1312 + *
1313 + *   1.  Redistributions of source code must retain the above copyright notice,
1314 + *       this list of conditions and the following disclaimer.
1315 + *
1316 + *   2.  Redistributions in binary form must reproduce the above copyright
1317 + *       notice, this list of conditions and the following disclaimer in the
1318 + *       documentation and/or other materials provided with the distribution.
1319 + *
1320 + *   3.  The name of the author may not be used to endorse or promote products
1321 + *       derived from this software without specific prior written permission.
1322 + *
1323 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
1324 + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
1325 + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
1326 + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
1327 + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
1328 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
1329 + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
1330 + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
1331 + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
1332 + * OF THE POSSIBILITY OF SUCH DAMAGE.
1333 + *
1334 + * Alternatively, the contents of this file may be used under the terms of
1335 + * the GNU General Public License version 2 (the "GPL"), in which case the
1336 + * provisions of the GPL are applicable instead of the above. If you wish to
1337 + * allow the use of your version of this file only under the terms of the
1338 + * GPL and not to allow others to use your version of this file under the
1339 + * BSD license, indicate your decision by deleting the provisions above and
1340 + * replace them with the notice and other provisions required by the GPL. If
1341 + * you do not delete the provisions above, a recipient may use your version
1342 + * of this file under either the BSD or the GPL.
1343 + */
1344 +
1345 +#include <linux/kernel.h>
1346 +#include <linux/module.h>
1347 +#include <linux/init.h>
1349 +#include <linux/crypto.h>
1350 +#include <linux/err.h>
1351 +#include <linux/vmalloc.h>
1352 +#include <asm/string.h>
1353 +
1354 +struct lzf_ctx {
1355 +       void *hbuf;
1356 +       unsigned int bufofs;
1357 +};
1358 +
1359 +/*
1360 + * size of hashtable is (1 << hlog) * sizeof (char *)
1361 + * decompression is independent of the hash table size
1362 + * the difference between 15 and 14 is very small
1363 + * for small blocks (and 14 is also faster).
1364 + * For a low-memory configuration, use hlog == 13;
1365 + * For best compression, use 15 or 16.
1366 + */
1367 +static const int hlog = 13;
1368 +
1369 +/*
1370 + * don't play with this unless you benchmark!
1371 + * decompression is not dependent on the hash function
1372 + * the hashing function might seem strange, just believe me
1373 + * it works ;)
1374 + */
1375 +static inline u16 first(const u8 *p)
1376 +{
1377 +       return ((p[0]) << 8) + p[1];
1378 +}
1379 +
1380 +static inline u16 next(u8 v, const u8 *p)
1381 +{
1382 +       return ((v) << 8) + p[2];
1383 +}
1384 +
1385 +static inline u32 idx(unsigned int h)
1386 +{
1387 +       return (((h ^ (h << 5)) >> (3*8 - hlog)) + h*3) & ((1 << hlog) - 1);
1388 +}
1389 +
1390 +/*
1391 + * IDX works because it is very similar to a multiplicative hash, e.g.
1392 + * (h * 57321 >> (3*8 - hlog))
1393 + * the next one is also quite good, albeit slow ;)
1394 + * (int)(cos(h & 0xffffff) * 1e6)
1395 + */
1396 +
1397 +static const int max_lit = (1 <<  5);
1398 +static const int max_off = (1 << 13);
1399 +static const int max_ref = ((1 <<  8) + (1 << 3));
1400 +
1401 +/*
1402 + * compressed format
1403 + *
1404 + * 000LLLLL <L+1>    ; literal
1405 + * LLLOOOOO oooooooo ; backref L
1406 + * 111OOOOO LLLLLLLL oooooooo ; backref L+7
1407 + *
1408 + */
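+
+/*
+ * Worked example (illustrative only): a 5-byte match at distance 10 is
+ * stored with len = 5 - 2 = 3 and off = 10 - 1 = 9, giving the two bytes
+ * (off >> 8) + (len << 5) = 0x60 and (off & 0xff) = 0x09. The decompressor
+ * reverses this: len = (0x60 >> 5) + 2 = 5 and the copy source is
+ * op - ((0x60 & 0x1f) << 8) - 0x09 - 1 = op - 10.
+ */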
1409 +
1410 +static void lzf_compress_exit(struct crypto_tfm *tfm)
1411 +{
1412 +       struct lzf_ctx *ctx = crypto_tfm_ctx(tfm);
1413 +
1414 +       if (!ctx->hbuf)
1415 +               return;
1416 +
1417 +       vfree(ctx->hbuf);
1418 +       ctx->hbuf = NULL;
1419 +}
1420 +
1421 +static int lzf_compress_init(struct crypto_tfm *tfm)
1422 +{
1423 +       struct lzf_ctx *ctx = crypto_tfm_ctx(tfm);
1424 +
1425 +       /* Get LZF ready to go */
1426 +       ctx->hbuf = vmalloc_32((1 << hlog) * sizeof(char *));
1427 +       if (ctx->hbuf)
1428 +               return 0;
1429 +
1430 +       printk(KERN_WARNING "Failed to allocate %ld bytes for lzf workspace\n",
1431 +                       (long) ((1 << hlog) * sizeof(char *)));
1432 +       return -ENOMEM;
1433 +}
1434 +
1435 +static int lzf_compress(struct crypto_tfm *tfm, const u8 *in_data,
1436 +               unsigned int in_len, u8 *out_data, unsigned int *out_len)
1437 +{
1438 +       struct lzf_ctx *ctx = crypto_tfm_ctx(tfm);
1439 +       const u8 **htab = ctx->hbuf;
1440 +       const u8 **hslot;
1441 +       const u8 *ip = in_data;
1442 +       u8 *op = out_data;
1443 +       const u8 *in_end = ip + in_len;
1444 +       u8 *out_end = op + *out_len - 3;
1445 +       const u8 *ref;
1446 +
1447 +       unsigned int hval = first(ip);
1448 +       unsigned long off;
1449 +       int lit = 0;
1450 +
1451 +       memset(htab, 0, (1 << hlog) * sizeof(char *)); /* whole table, not sizeof(pointer) */
1452 +
1453 +       for (;;) {
1454 +               if (ip < in_end - 2) {
1455 +                       hval = next(hval, ip);
1456 +                       hslot = htab + idx(hval);
1457 +                       ref = *hslot;
1458 +                       *hslot = ip;
1459 +
1460 +                       off = ip - ref - 1;
1461 +                       if (off < max_off
1462 +                           && ip + 4 < in_end && ref > in_data
1463 +                           && *(u16 *) ref == *(u16 *) ip && ref[2] == ip[2]
1464 +                           ) {
1465 +                               /* match found at *ref++ */
1466 +                               unsigned int len = 2;
1467 +                               unsigned int maxlen = in_end - ip - len;
1468 +                               maxlen = maxlen > max_ref ? max_ref : maxlen;
1469 +
1470 +                               do
1471 +                                       len++;
1472 +                               while (len < maxlen && ref[len] == ip[len]);
1473 +
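+                               /* Out of output space; report a full page so
+                                * the caller can store the data uncompressed. */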
1474 +                               if (op + lit + 1 + 3 >= out_end) {
1475 +                                       *out_len = PAGE_SIZE;
1476 +                                       return 0;
1477 +                               }
1478 +
1479 +                               if (lit) {
1480 +                                       *op++ = lit - 1;
1481 +                                       lit = -lit;
1482 +                                       do {
1483 +                                               *op++ = ip[lit];
1484 +                                       } while (++lit);
1485 +                               }
1486 +
1487 +                               len -= 2;
1488 +                               ip++;
1489 +
1490 +                               if (len < 7) {
1491 +                                       *op++ = (off >> 8) + (len << 5);
1492 +                               } else {
1493 +                                       *op++ = (off >> 8) + (7 << 5);
1494 +                                       *op++ = len - 7;
1495 +                               }
1496 +
1497 +                               *op++ = off;
1498 +
1499 +                               ip += len;
1500 +                               hval = first(ip);
1501 +                               hval = next(hval, ip);
1502 +                               htab[idx(hval)] = ip;
1503 +                               ip++;
1504 +                               continue;
1505 +                       }
1506 +               } else if (ip == in_end)
1507 +                       break;
1508 +
1509 +               /* one more literal byte we must copy */
1510 +               lit++;
1511 +               ip++;
1512 +
1513 +               if (lit == max_lit) {
1514 +                       if (op + 1 + max_lit >= out_end) {
1515 +                               *out_len = PAGE_SIZE;
1516 +                               return 0;
1517 +                       }
1518 +
1519 +                       *op++ = max_lit - 1;
1520 +                       memcpy(op, ip - max_lit, max_lit);
1521 +                       op += max_lit;
1522 +                       lit = 0;
1523 +               }
1524 +       }
1525 +
1526 +       if (lit) {
1527 +               if (op + lit + 1 >= out_end) {
1528 +                       *out_len = PAGE_SIZE;
1529 +                       return 0;
1530 +               }
1531 +
1532 +               *op++ = lit - 1;
1533 +               lit = -lit;
1534 +               do {
1535 +                       *op++ = ip[lit];
1536 +               } while (++lit);
1537 +       }
1538 +
1539 +       *out_len = op - out_data;
1540 +       return 0;
1541 +}
1542 +
1543 +static int lzf_decompress(struct crypto_tfm *tfm, const u8 *src,
1544 +               unsigned int slen, u8 *dst, unsigned int *dlen)
1545 +{
1546 +       u8 const *ip = src;
1547 +       u8 *op = dst;
1548 +       u8 const *const in_end = ip + slen;
1549 +       u8 *const out_end = op + *dlen;
1550 +
1551 +       *dlen = PAGE_SIZE;
1552 +       do {
1553 +               unsigned int ctrl = *ip++;
1554 +
1555 +               if (ctrl < (1 << 5)) {
1556 +                       /* literal run */
1557 +                       ctrl++;
1558 +
1559 +                       if (op + ctrl > out_end)
1560 +                               return 0;
1561 +                       memcpy(op, ip, ctrl);
1562 +                       op += ctrl;
1563 +                       ip += ctrl;
1564 +               } else {        /* back reference */
1565 +
1566 +                       unsigned int len = ctrl >> 5;
1567 +
1568 +                       u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
1569 +
1570 +                       if (len == 7)
1571 +                               len += *ip++;
1572 +
1573 +                       ref -= *ip++;
1574 +                       len += 2;
1575 +
1576 +                       if (op + len > out_end || ref < (u8 *) dst)
1577 +                               return 0;
1578 +
1579 +                       do {
1580 +                               *op++ = *ref++;
1581 +                       } while (--len);
1582 +               }
1583 +       } while (op < out_end && ip < in_end);
1584 +
1585 +       *dlen = op - (u8 *) dst;
1586 +       return 0;
1587 +}
1588 +
1589 +static struct crypto_alg alg = {
1590 +       .cra_name = "lzf",
1591 +       .cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
1592 +       .cra_ctxsize = sizeof(struct lzf_ctx),
1593 +       .cra_module = THIS_MODULE,
1594 +       .cra_list = LIST_HEAD_INIT(alg.cra_list),
1595 +       .cra_init = lzf_compress_init,
1596 +       .cra_exit = lzf_compress_exit,
1597 +       .cra_u = { .compress = {
1598 +       .coa_compress = lzf_compress,
1599 +       .coa_decompress = lzf_decompress } }
1600 +};
1601 +
1602 +static int __init init(void)
1603 +{
1604 +       return crypto_register_alg(&alg);
1605 +}
1606 +
1607 +static void __exit fini(void)
1608 +{
1609 +       crypto_unregister_alg(&alg);
1610 +}
1611 +
1612 +module_init(init);
1613 +module_exit(fini);
1614 +
1615 +MODULE_LICENSE("GPL");
1616 +MODULE_DESCRIPTION("LZF Compression Algorithm");
1617 +MODULE_AUTHOR("Marc Alexander Lehmann & Nigel Cunningham");
1618 diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
1619 index 273a944..37c1e9f 100644
1620 --- a/drivers/base/power/main.c
1621 +++ b/drivers/base/power/main.c
1622 @@ -54,6 +54,7 @@ void device_pm_lock(void)
1623  {
1624         mutex_lock(&dpm_list_mtx);
1625  }
1626 +EXPORT_SYMBOL(device_pm_lock);
1627  
1628  /**
1629   *     device_pm_unlock - unlock the list of active devices used by the PM core
1630 @@ -62,6 +63,7 @@ void device_pm_unlock(void)
1631  {
1632         mutex_unlock(&dpm_list_mtx);
1633  }
1634 +EXPORT_SYMBOL(device_pm_unlock);
1635  
1636  /**
1637   *     device_pm_add - add a device to the list of active devices
1638 diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
1639 index d524dc2..681972e 100644
1640 --- a/drivers/macintosh/via-pmu.c
1641 +++ b/drivers/macintosh/via-pmu.c
1642 @@ -40,7 +40,6 @@
1643  #include <linux/interrupt.h>
1644  #include <linux/device.h>
1645  #include <linux/sysdev.h>
1646 -#include <linux/freezer.h>
1647  #include <linux/syscalls.h>
1648  #include <linux/suspend.h>
1649  #include <linux/cpu.h>
1650 diff --git a/drivers/md/md.c b/drivers/md/md.c
1651 index deeac4b..cbf1e49 100644
1652 --- a/drivers/md/md.c
1653 +++ b/drivers/md/md.c
1654 @@ -5899,6 +5899,8 @@ void md_do_sync(mddev_t *mddev)
1655                         last_mark = next;
1656                 }
1657  
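+               /* Pause resync while the freezer is active, so md I/O
+                * cannot interfere with hibernation. */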
1658 +               while (freezer_is_on())
1659 +                       yield();
1660  
1661                 if (kthread_should_stop())
1662                         goto interrupted;
1663 diff --git a/fs/buffer.c b/fs/buffer.c
1664 index ac78d4c..3a927e2 100644
1665 --- a/fs/buffer.c
1666 +++ b/fs/buffer.c
1667 @@ -247,6 +247,91 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb)
1668  }
1669  EXPORT_SYMBOL(thaw_bdev);
1670  
1671 +#if 0
1672 +#define FS_PRINTK(fmt, args...) printk(fmt, ## args)
1673 +#else
1674 +#define FS_PRINTK(fmt, args...)
1675 +#endif
1676 +
1677 +/* #define DEBUG_FS_FREEZING */
1678 +
1679 +/**
1680 + * freeze_filesystems - lock all filesystems and force them into a consistent
1681 + * state
1682 + */
1683 +void freeze_filesystems(int which)
1684 +{
1685 +       struct super_block *sb;
1686 +
1687 +       lockdep_off();
1688 +
1689 +       /*
1690 +        * Freeze in reverse order so filesystems dependent upon others are
1691 +        * frozen in the right order (eg. loopback on ext3).
1692 +        */
1693 +       list_for_each_entry_reverse(sb, &super_blocks, s_list) {
1694 +               FS_PRINTK(KERN_INFO "Considering %s.%s: (root %p, bdev %x)",
1695 +                       sb->s_type->name ? sb->s_type->name : "?",
1696 +                       sb->s_subtype ? sb->s_subtype : "", sb->s_root,
1697 +                       sb->s_bdev ? sb->s_bdev->bd_dev : 0);
1698 +
1699 +               if (sb->s_type->fs_flags & FS_IS_FUSE &&
1700 +                   sb->s_frozen == SB_UNFROZEN &&
1701 +                   which & FS_FREEZER_FUSE) {
1702 +                       sb->s_frozen = SB_FREEZE_TRANS;
1703 +                       sb->s_flags |= MS_FROZEN;
1704 +                       printk("Fuse filesystem done.\n");
1705 +                       continue;
1706 +               }
1707 +
1708 +               if (!sb->s_root || !sb->s_bdev ||
1709 +                   (sb->s_frozen == SB_FREEZE_TRANS) ||
1710 +                   (sb->s_flags & MS_RDONLY) ||
1711 +                   (sb->s_flags & MS_FROZEN) ||
1712 +                   !(which & FS_FREEZER_NORMAL)) {
1713 +                       FS_PRINTK(KERN_INFO "Nope.\n");
1714 +                       continue;
1715 +               }
1716 +
1717 +               FS_PRINTK(KERN_INFO "Freezing %x... ", sb->s_bdev->bd_dev);
1718 +               freeze_bdev(sb->s_bdev);
1719 +               sb->s_flags |= MS_FROZEN;
1720 +               FS_PRINTK(KERN_INFO "Done.\n");
1721 +       }
1722 +
1723 +       lockdep_on();
1724 +}
1725 +
1726 +/**
1727 + * thaw_filesystems - unlock all filesystems
1728 + */
1729 +void thaw_filesystems(int which)
1730 +{
1731 +       struct super_block *sb;
1732 +
1733 +       lockdep_off();
1734 +
1735 +       list_for_each_entry(sb, &super_blocks, s_list) {
1736 +               if (!(sb->s_flags & MS_FROZEN))
1737 +                       continue;
1738 +
1739 +               if (sb->s_type->fs_flags & FS_IS_FUSE) {
1740 +                       if (!(which & FS_FREEZER_FUSE))
1741 +                               continue;
1742 +
1743 +                       sb->s_frozen = SB_UNFROZEN;
1744 +               } else {
1745 +                       if (!(which & FS_FREEZER_NORMAL))
1746 +                               continue;
1747 +
1748 +                       thaw_bdev(sb->s_bdev, sb);
1749 +               }
1750 +               sb->s_flags &= ~MS_FROZEN;
1751 +       }
1752 +
1753 +       lockdep_on();
1754 +}
1755 +
1756  /*
1757   * Various filesystems appear to want __find_get_block to be non-blocking.
1758   * But it's the page lock which protects the buffers.  To get around this,
1759 diff --git a/fs/drop_caches.c b/fs/drop_caches.c
1760 index 3e5637f..0d4c88e 100644
1761 --- a/fs/drop_caches.c
1762 +++ b/fs/drop_caches.c
1763 @@ -33,7 +33,7 @@ static void drop_pagecache_sb(struct super_block *sb)
1764         iput(toput_inode);
1765  }
1766  
1767 -static void drop_pagecache(void)
1768 +void drop_pagecache(void)
1769  {
1770         struct super_block *sb;
1771  
1772 diff --git a/fs/fuse/control.c b/fs/fuse/control.c
1773 index 4f3cab3..f15b0c5 100644
1774 --- a/fs/fuse/control.c
1775 +++ b/fs/fuse/control.c
1776 @@ -207,6 +207,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
1777  static struct file_system_type fuse_ctl_fs_type = {
1778         .owner          = THIS_MODULE,
1779         .name           = "fusectl",
1780 +       .fs_flags       = FS_IS_FUSE,
1781         .get_sb         = fuse_ctl_get_sb,
1782         .kill_sb        = fuse_ctl_kill_sb,
1783  };
1784 diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
1785 index 87250b6..7246e3d 100644
1786 --- a/fs/fuse/dev.c
1787 +++ b/fs/fuse/dev.c
1788 @@ -7,6 +7,7 @@
1789  */
1790  
1791  #include "fuse_i.h"
1792 +#include "fuse.h"
1793  
1794  #include <linux/init.h>
1795  #include <linux/module.h>
1796 @@ -16,6 +17,7 @@
1797  #include <linux/pagemap.h>
1798  #include <linux/file.h>
1799  #include <linux/slab.h>
1800 +#include <linux/freezer.h>
1801  
1802  MODULE_ALIAS_MISCDEV(FUSE_MINOR);
1803  
1804 @@ -743,6 +745,8 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1805         if (!fc)
1806                 return -EPERM;
1807  
1808 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_dev_read");
1809 +
1810   restart:
1811         spin_lock(&fc->lock);
1812         err = -EAGAIN;
1813 @@ -869,6 +873,9 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1814         if (!fc)
1815                 return -EPERM;
1816  
1817 +       FUSE_MIGHT_FREEZE(iocb->ki_filp->f_mapping->host->i_sb,
1818 +                       "fuse_dev_write");
1819 +
1820         fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1821         if (nbytes < sizeof(struct fuse_out_header))
1822                 return -EINVAL;
1823 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
1824 index fd03330..347e054 100644
1825 --- a/fs/fuse/dir.c
1826 +++ b/fs/fuse/dir.c
1827 @@ -7,12 +7,14 @@
1828  */
1829  
1830  #include "fuse_i.h"
1831 +#include "fuse.h"
1832  
1833  #include <linux/pagemap.h>
1834  #include <linux/file.h>
1835  #include <linux/gfp.h>
1836  #include <linux/sched.h>
1837  #include <linux/namei.h>
1838 +#include <linux/freezer.h>
1839  
1840  #if BITS_PER_LONG >= 64
1841  static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
1842 @@ -174,6 +176,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
1843                         return 0;
1844  
1845                 fc = get_fuse_conn(inode);
1846 +
1847 +               FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_dentry_revalidate");
1848 +
1849                 req = fuse_get_req(fc);
1850                 if (IS_ERR(req))
1851                         return 0;
1852 @@ -273,6 +278,8 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
1853         if (IS_ERR(req))
1854                 goto out;
1855  
1856 +       FUSE_MIGHT_FREEZE(sb, "fuse_lookup");
1857 +
1858         forget_req = fuse_get_req(fc);
1859         err = PTR_ERR(forget_req);
1860         if (IS_ERR(forget_req)) {
1861 @@ -402,6 +409,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
1862         if (IS_ERR(forget_req))
1863                 return PTR_ERR(forget_req);
1864  
1865 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_create_open");
1866 +
1867         req = fuse_get_req(fc);
1868         err = PTR_ERR(req);
1869         if (IS_ERR(req))
1870 @@ -488,6 +497,8 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
1871         int err;
1872         struct fuse_req *forget_req;
1873  
1874 +       FUSE_MIGHT_FREEZE(dir->i_sb, "create_new_entry");
1875 +
1876         forget_req = fuse_get_req(fc);
1877         if (IS_ERR(forget_req)) {
1878                 fuse_put_request(fc, req);
1879 @@ -585,7 +596,11 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
1880  {
1881         struct fuse_mkdir_in inarg;
1882         struct fuse_conn *fc = get_fuse_conn(dir);
1883 -       struct fuse_req *req = fuse_get_req(fc);
1884 +       struct fuse_req *req;
1885 +
1886 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_mkdir");
1887 +
1888 +       req = fuse_get_req(fc);
1889         if (IS_ERR(req))
1890                 return PTR_ERR(req);
1891  
1892 @@ -605,7 +620,11 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
1893  {
1894         struct fuse_conn *fc = get_fuse_conn(dir);
1895         unsigned len = strlen(link) + 1;
1896 -       struct fuse_req *req = fuse_get_req(fc);
1897 +       struct fuse_req *req;
1898 +
1899 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_symlink");
1900 +
1901 +       req = fuse_get_req(fc);
1902         if (IS_ERR(req))
1903                 return PTR_ERR(req);
1904  
1905 @@ -622,7 +641,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
1906  {
1907         int err;
1908         struct fuse_conn *fc = get_fuse_conn(dir);
1909 -       struct fuse_req *req = fuse_get_req(fc);
1910 +       struct fuse_req *req;
1911 +
1912 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_unlink");
1913 +
1914 +       req = fuse_get_req(fc);
1915         if (IS_ERR(req))
1916                 return PTR_ERR(req);
1917  
1918 @@ -653,7 +676,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
1919  {
1920         int err;
1921         struct fuse_conn *fc = get_fuse_conn(dir);
1922 -       struct fuse_req *req = fuse_get_req(fc);
1923 +       struct fuse_req *req;
1924 +
1925 +       FUSE_MIGHT_FREEZE(dir->i_sb, "fuse_rmdir");
1926 +
1927 +       req = fuse_get_req(fc);
1928         if (IS_ERR(req))
1929                 return PTR_ERR(req);
1930  
1931 diff --git a/fs/fuse/file.c b/fs/fuse/file.c
1932 index 2bada6b..2e4af49 100644
1933 --- a/fs/fuse/file.c
1934 +++ b/fs/fuse/file.c
1935 @@ -7,11 +7,13 @@
1936  */
1937  
1938  #include "fuse_i.h"
1939 +#include "fuse.h"
1940  
1941  #include <linux/pagemap.h>
1942  #include <linux/slab.h>
1943  #include <linux/kernel.h>
1944  #include <linux/sched.h>
1945 +#include <linux/freezer.h>
1946  
1947  static const struct file_operations fuse_direct_io_file_operations;
1948  
1949 @@ -23,6 +25,8 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
1950         struct fuse_req *req;
1951         int err;
1952  
1953 +       FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_send_open");
1954 +
1955         req = fuse_get_req(fc);
1956         if (IS_ERR(req))
1957                 return PTR_ERR(req);
1958 @@ -674,6 +678,8 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
1959         if (is_bad_inode(inode))
1960                 return -EIO;
1961  
1962 +       FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_commit_write");
1963 +
1964         /*
1965          * Make sure writepages on the same page are not mixed up with
1966          * plain writes.
1967 @@ -962,6 +968,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1968         if (is_bad_inode(inode))
1969                 return -EIO;
1970  
1971 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_direct_io");
1972 +
1973         req = fuse_get_req(fc);
1974         if (IS_ERR(req))
1975                 return PTR_ERR(req);
1976 @@ -1315,6 +1323,8 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
1977         struct fuse_lk_out outarg;
1978         int err;
1979  
1980 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_getlk");
1981 +
1982         req = fuse_get_req(fc);
1983         if (IS_ERR(req))
1984                 return PTR_ERR(req);
1985 @@ -1350,6 +1360,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1986         if (fl->fl_flags & FL_CLOSE)
1987                 return 0;
1988  
1989 +       FUSE_MIGHT_FREEZE(file->f_mapping->host->i_sb, "fuse_setlk");
1990 +
1991         req = fuse_get_req(fc);
1992         if (IS_ERR(req))
1993                 return PTR_ERR(req);
1994 @@ -1416,6 +1428,8 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1995         if (!inode->i_sb->s_bdev || fc->no_bmap)
1996                 return 0;
1997  
1998 +       FUSE_MIGHT_FREEZE(inode->i_sb, "fuse_bmap");
1999 +
2000         req = fuse_get_req(fc);
2001         if (IS_ERR(req))
2002                 return 0;
2003 diff --git a/fs/fuse/fuse.h b/fs/fuse/fuse.h
2004 new file mode 100644
2005 index 0000000..170e49a
2006 --- /dev/null
2007 +++ b/fs/fuse/fuse.h
2008 @@ -0,0 +1,13 @@
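+/*
+ * Spin (freezing ourselves if asked) while the superblock is frozen, so
+ * that new fuse requests are not submitted while the userspace daemon
+ * that would service them may itself be frozen. "desc" names the call
+ * site for the once-only log message.
+ */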
2009 +#define FUSE_MIGHT_FREEZE(superblock, desc) \
2010 +do { \
2011 +       int printed = 0; \
2012 +       while (superblock->s_frozen != SB_UNFROZEN) { \
2013 +               if (!printed) { \
2014 +                       printk(KERN_INFO "%d frozen in " desc ".\n", \
2015 +                                               current->pid); \
2016 +                       printed = 1; \
2017 +               } \
2018 +               try_to_freeze(); \
2019 +               yield(); \
2020 +       } \
2021 +} while (0)
2022 diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
2023 index d2249f1..04ae6cb 100644
2024 --- a/fs/fuse/inode.c
2025 +++ b/fs/fuse/inode.c
2026 @@ -914,7 +914,7 @@ static int fuse_get_sb(struct file_system_type *fs_type,
2027  static struct file_system_type fuse_fs_type = {
2028         .owner          = THIS_MODULE,
2029         .name           = "fuse",
2030 -       .fs_flags       = FS_HAS_SUBTYPE,
2031 +       .fs_flags       = FS_HAS_SUBTYPE | FS_IS_FUSE,
2032         .get_sb         = fuse_get_sb,
2033         .kill_sb        = kill_anon_super,
2034  };
2035 @@ -933,7 +933,7 @@ static struct file_system_type fuseblk_fs_type = {
2036         .name           = "fuseblk",
2037         .get_sb         = fuse_get_sb_blk,
2038         .kill_sb        = kill_block_super,
2039 -       .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
2040 +       .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_IS_FUSE,
2041  };
2042  
2043  static inline int register_fuseblk(void)
2044 diff --git a/fs/ioctl.c b/fs/ioctl.c
2045 index 7db32b3..be49ee9 100644
2046 --- a/fs/ioctl.c
2047 +++ b/fs/ioctl.c
2048 @@ -211,3 +211,4 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
2049   out:
2050         return error;
2051  }
2052 +EXPORT_SYMBOL(sys_ioctl);
2053 diff --git a/fs/namei.c b/fs/namei.c
2054 index 4ea63ed..65be6a6 100644
2055 --- a/fs/namei.c
2056 +++ b/fs/namei.c
2057 @@ -2223,6 +2223,8 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2058         if (!dir->i_op || !dir->i_op->unlink)
2059                 return -EPERM;
2060  
2061 +       vfs_check_frozen(dir->i_sb, SB_FREEZE_WRITE);
2062 +
2063         DQUOT_INIT(dir);
2064  
2065         mutex_lock(&dentry->d_inode->i_mutex);
2066 diff --git a/include/linux/Kbuild b/include/linux/Kbuild
2067 index b68ec09..6eebd34 100644
2068 --- a/include/linux/Kbuild
2069 +++ b/include/linux/Kbuild
2070 @@ -208,6 +208,7 @@ unifdef-y += filter.h
2071  unifdef-y += flat.h
2072  unifdef-y += futex.h
2073  unifdef-y += fs.h
2074 +unifdef-y += freezer.h
2075  unifdef-y += gameport.h
2076  unifdef-y += generic_serial.h
2077  unifdef-y += hayesesp.h
2078 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
2079 index eadaab4..7eb6655 100644
2080 --- a/include/linux/buffer_head.h
2081 +++ b/include/linux/buffer_head.h
2082 @@ -171,6 +171,11 @@ wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
2083  int fsync_bdev(struct block_device *);
2084  struct super_block *freeze_bdev(struct block_device *);
2085  void thaw_bdev(struct block_device *, struct super_block *);
2086 +#define FS_FREEZER_FUSE 1
2087 +#define FS_FREEZER_NORMAL 2
2088 +#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL)
2089 +void freeze_filesystems(int which);
2090 +void thaw_filesystems(int which);
2091  int fsync_super(struct super_block *);
2092  int fsync_no_super(struct block_device *);
2093  struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
2094 diff --git a/include/linux/dyn_pageflags.h b/include/linux/dyn_pageflags.h
2095 new file mode 100644
2096 index 0000000..e85c3ee
2097 --- /dev/null
2098 +++ b/include/linux/dyn_pageflags.h
2099 @@ -0,0 +1,66 @@
2100 +/*
2101 + * include/linux/dyn_pageflags.h
2102 + *
2103 + * Copyright (C) 2004-2007 Nigel Cunningham <nigel at tuxonice net>
2104 + *
2105 + * This file is released under the GPLv2.
2106 + *
2107 + * It implements support for dynamically allocated bitmaps that are
2108 + * used for temporary or infrequently used pageflags, in lieu of
2109 + * bits in the struct page flags entry.
2110 + */
2111 +
2112 +#ifndef DYN_PAGEFLAGS_H
2113 +#define DYN_PAGEFLAGS_H
2114 +
2115 +#include <linux/mm.h>
2116 +
2117 +struct dyn_pageflags {
2118 +       unsigned long ****bitmap; /* [pg_dat][zone][page_num] */
2119 +       int sparse, initialised;
2120 +       struct list_head list;
2121 +       spinlock_t struct_lock;
2122 +};
2123 +
2124 +#define DYN_PAGEFLAGS_INIT(name) { \
2125 +       .list = LIST_HEAD_INIT(name.list), \
2126 +       .struct_lock = __SPIN_LOCK_UNLOCKED(name.lock) \
2127 +}
2128 +
2129 +#define DECLARE_DYN_PAGEFLAGS(name) \
2130 +       struct dyn_pageflags name = DYN_PAGEFLAGS_INIT(name);
2131 +
2132 +#define BITMAP_FOR_EACH_SET(BITMAP, CTR) \
2133 +       for (CTR = get_next_bit_on(BITMAP, max_pfn + 1); CTR <= max_pfn; \
2134 +               CTR = get_next_bit_on(BITMAP, CTR))
2135 +
2136 +extern void clear_dyn_pageflags(struct dyn_pageflags *pagemap);
2137 +extern int allocate_dyn_pageflags(struct dyn_pageflags *pagemap, int sparse);
2138 +extern void free_dyn_pageflags(struct dyn_pageflags *pagemap);
2139 +extern unsigned long get_next_bit_on(struct dyn_pageflags *bitmap,
2140 +       unsigned long counter);
2141 +
2142 +extern int test_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
2143 +/*
2144 + * In sparse bitmaps, setting a flag can fail (we can fail to allocate
2145 + * the page to store the bit). If this happens, we will BUG(). If you don't
2146 + * want this behaviour, don't allocate sparse pageflags.
2147 + */
2148 +extern void set_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
2149 +extern void clear_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
2150 +extern void dump_pagemap(struct dyn_pageflags *pagemap);
2151 +
2152 +/*
2153 + * With the above macros defined, you can do...
2154 + * #define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
2155 + * #define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
2156 + * #define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
2157 + */
2158 +
2159 +extern void __init dyn_pageflags_init(void);
2160 +extern void __init dyn_pageflags_use_kzalloc(void);
2161 +
2162 +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
2163 +extern void dyn_pageflags_hotplug(struct zone *zone);
2164 +#endif
2165 +#endif
2166 diff --git a/include/linux/freezer.h b/include/linux/freezer.h
2167 index deddeed..4d92953 100644
2168 --- a/include/linux/freezer.h
2169 +++ b/include/linux/freezer.h
2170 @@ -127,6 +127,19 @@ static inline void set_freezable(void)
2171         current->flags &= ~PF_NOFREEZE;
2172  }
2173  
2174 +extern int freezer_state;
2175 +#define FREEZER_OFF 0
2176 +#define FREEZER_FILESYSTEMS_FROZEN 1
2177 +#define FREEZER_USERSPACE_FROZEN 2
2178 +#define FREEZER_FULLY_ON 3
2179 +
2180 +static inline int freezer_is_on(void)
2181 +{
2182 +       return (freezer_state == FREEZER_FULLY_ON);
2183 +}
2184 +
2185 +extern void thaw_kernel_threads(void);
2186 +
2187  /*
2188   * Tell the freezer that the current task should be frozen by it and that it
2189   * should send a fake signal to the task to freeze it.
2190 @@ -178,6 +191,8 @@ static inline int freeze_processes(void) { BUG(); return 0; }
2191  static inline void thaw_processes(void) {}
2192  
2193  static inline int try_to_freeze(void) { return 0; }
2194 +static inline int freezer_is_on(void) { return 0; }
2195 +static inline void thaw_kernel_threads(void) { }
2196  
2197  static inline void freezer_do_not_count(void) {}
2198  static inline void freezer_count(void) {}
2199 diff --git a/include/linux/fs.h b/include/linux/fs.h
2200 index 580b513..e7a3169 100644
2201 --- a/include/linux/fs.h
2202 +++ b/include/linux/fs.h
2203 @@ -8,6 +8,7 @@
2204  
2205  #include <linux/limits.h>
2206  #include <linux/ioctl.h>
2207 +#include <linux/freezer.h>
2208  
2209  /*
2210   * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
2211 @@ -96,6 +97,7 @@ extern int dir_notify_enable;
2212  #define FS_REQUIRES_DEV 1 
2213  #define FS_BINARY_MOUNTDATA 2
2214  #define FS_HAS_SUBTYPE 4
2215 +#define FS_IS_FUSE     8       /* FUSE filesystem - freeze these during bdev freezes too */
2216  #define FS_REVAL_DOT   16384   /* Check the paths ".", ".." for staleness */
2217  #define FS_RENAME_DOES_D_MOVE  32768   /* FS will handle d_move()
2218                                          * during rename() internally.
2219 @@ -128,6 +130,7 @@ extern int dir_notify_enable;
2220  #define MS_RELATIME    (1<<21) /* Update atime relative to mtime/ctime. */
2221  #define MS_KERNMOUNT   (1<<22) /* this is a kern_mount call */
2222  #define MS_I_VERSION   (1<<23) /* Update inode I_version field */
2223 +#define MS_FROZEN      (1<<24) /* Frozen by freeze_filesystems() */
2224  #define MS_ACTIVE      (1<<30)
2225  #define MS_NOUSER      (1<<31)
2226  
2227 @@ -1141,8 +1144,11 @@ enum {
2228         SB_FREEZE_TRANS = 2,
2229  };
2230  
2231 -#define vfs_check_frozen(sb, level) \
2232 -       wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
2233 +#define vfs_check_frozen(sb, level) do { \
2234 +       freezer_do_not_count(); \
2235 +       wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))); \
2236 +       freezer_count(); \
2237 +} while (0)
2238  
2239  #define get_fs_excl() atomic_inc(&current->fs_excl)
2240  #define put_fs_excl() atomic_dec(&current->fs_excl)
2241 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
2242 index 2651f80..c50279d 100644
2243 --- a/include/linux/kernel.h
2244 +++ b/include/linux/kernel.h
2245 @@ -165,6 +165,8 @@ extern int vsprintf(char *buf, const char *, va_list)
2246         __attribute__ ((format (printf, 2, 0)));
2247  extern int snprintf(char * buf, size_t size, const char * fmt, ...)
2248         __attribute__ ((format (printf, 3, 4)));
2249 +extern int snprintf_used(char *buffer, int buffer_size,
2250 +               const char *fmt, ...);
2251  extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
2252         __attribute__ ((format (printf, 3, 0)));
2253  extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
2254 diff --git a/include/linux/mm.h b/include/linux/mm.h
2255 index 72a15dc..01a7657 100644
2256 --- a/include/linux/mm.h
2257 +++ b/include/linux/mm.h
2258 @@ -1264,6 +1264,7 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
2259                                         void __user *, size_t *, loff_t *);
2260  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
2261                         unsigned long lru_pages);
2262 +void drop_pagecache(void);
2263  
2264  #ifndef CONFIG_MMU
2265  #define randomize_va_space 0
2266 diff --git a/include/linux/netlink.h b/include/linux/netlink.h
2267 index 9ff1b54..100dc2e 100644
2268 --- a/include/linux/netlink.h
2269 +++ b/include/linux/netlink.h
2270 @@ -24,6 +24,8 @@
2271  /* leave room for NETLINK_DM (DM Events) */
2272  #define NETLINK_SCSITRANSPORT  18      /* SCSI Transports */
2273  #define NETLINK_ECRYPTFS       19
2274 +#define NETLINK_TOI_USERUI     20      /* TuxOnIce's userui */
2275 +#define NETLINK_TOI_USM                21      /* Userspace storage manager */
2276  
2277  #define MAX_LINKS 32           
2278  
2279 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
2280 index 2ce8207..374e892 100644
2281 --- a/include/linux/suspend.h
2282 +++ b/include/linux/suspend.h
2283 @@ -280,4 +280,69 @@ static inline void register_nosave_region_late(unsigned long b, unsigned long e)
2284  
2285  extern struct mutex pm_mutex;
2286  
2287 +enum {
2288 +       TOI_CAN_HIBERNATE,
2289 +       TOI_CAN_RESUME,
2290 +       TOI_RESUME_DEVICE_OK,
2291 +       TOI_NORESUME_SPECIFIED,
2292 +       TOI_SANITY_CHECK_PROMPT,
2293 +       TOI_CONTINUE_REQ,
2294 +       TOI_RESUMED_BEFORE,
2295 +       TOI_BOOT_TIME,
2296 +       TOI_NOW_RESUMING,
2297 +       TOI_IGNORE_LOGLEVEL,
2298 +       TOI_TRYING_TO_RESUME,
2299 +       TOI_LOADING_ALT_IMAGE,
2300 +       TOI_STOP_RESUME,
2301 +       TOI_IO_STOPPED,
2302 +       TOI_NOTIFIERS_PREPARE,
2303 +       TOI_CLUSTER_MODE,
2304 +};
2305 +
2306 +#ifdef CONFIG_TOI
2307 +
2308 +/* Used in init dir files */
2309 +extern unsigned long toi_state;
2310 +#define set_toi_state(bit) (set_bit(bit, &toi_state))
2311 +#define clear_toi_state(bit) (clear_bit(bit, &toi_state))
2312 +#define test_toi_state(bit) (test_bit(bit, &toi_state))
2313 +extern int toi_running;
2314 +
2315 +#else /* !CONFIG_TOI */
2316 +
2317 +#define toi_state              (0)
2318 +#define set_toi_state(bit) do { } while (0)
2319 +#define clear_toi_state(bit) do { } while (0)
2320 +#define test_toi_state(bit) (0)
2321 +#define toi_running (0)
2322 +#endif /* CONFIG_TOI */
2323 +
2324 +#ifdef CONFIG_HIBERNATION
2325 +#ifdef CONFIG_TOI
2326 +extern void toi_try_resume(void);
2327 +#else
2328 +#define toi_try_resume() do { } while (0)
2329 +#endif
2330 +
2331 +extern int resume_attempted;
2332 +extern int software_resume(void);
2333 +
2334 +static inline void check_resume_attempted(void)
2335 +{
2336 +       if (resume_attempted)
2337 +               return;
2338 +
2339 +       software_resume();
2340 +}
2341 +#else
2342 +#define check_resume_attempted() do { } while (0)
2343 +#define resume_attempted (0)
2344 +#endif
2345 +
2346 +#ifdef CONFIG_PRINTK_NOSAVE
2347 +#define POSS_NOSAVE __nosavedata
2348 +#else
2349 +#define POSS_NOSAVE
2350 +#endif
2351 +
2352  #endif /* _LINUX_SUSPEND_H */
2353 diff --git a/include/linux/swap.h b/include/linux/swap.h
2354 index de40f16..44050a6 100644
2355 --- a/include/linux/swap.h
2356 +++ b/include/linux/swap.h
2357 @@ -164,6 +164,7 @@ extern unsigned long totalram_pages;
2358  extern unsigned long totalreserve_pages;
2359  extern long nr_swap_pages;
2360  extern unsigned int nr_free_buffer_pages(void);
2361 +extern unsigned int nr_unallocated_buffer_pages(void);
2362  extern unsigned int nr_free_pagecache_pages(void);
2363  
2364  /* Definition of global_page_state not available yet */
2365 @@ -187,6 +188,8 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
2366                                                         gfp_t gfp_mask);
2367  extern int __isolate_lru_page(struct page *page, int mode);
2368  extern unsigned long shrink_all_memory(unsigned long nr_pages);
2369 +extern void shrink_one_zone(struct zone *zone, unsigned long desired_size,
2370 +               int ps_wanted);
2371  extern int vm_swappiness;
2372  extern int remove_mapping(struct address_space *mapping, struct page *page);
2373  extern long vm_total_pages;
2374 @@ -353,5 +356,10 @@ static inline swp_entry_t get_swap_page(void)
2375  #define disable_swap_token() do { } while(0)
2376  
2377  #endif /* CONFIG_SWAP */
2378 +
2379 +/* For TuxOnIce - unlink LRU pages while saving separately */
2380 +void unlink_lru_lists(void);
2381 +void relink_lru_lists(void);
2382 +
2383  #endif /* __KERNEL__*/
2384  #endif /* _LINUX_SWAP_H */
2385 diff --git a/init/do_mounts.c b/init/do_mounts.c
2386 index 3715feb..0463ea1 100644
2387 --- a/init/do_mounts.c
2388 +++ b/init/do_mounts.c
2389 @@ -400,6 +400,8 @@ void __init prepare_namespace(void)
2390         if (is_floppy && rd_doload && rd_load_disk(0))
2391                 ROOT_DEV = Root_RAM0;
2392  
2393 +       check_resume_attempted();
2394 +
2395         mount_root();
2396  out:
2397         sys_mount(".", "/", NULL, MS_MOVE, NULL);
2398 diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
2399 index 614241b..f3ea292 100644
2400 --- a/init/do_mounts_initrd.c
2401 +++ b/init/do_mounts_initrd.c
2402 @@ -6,6 +6,7 @@
2403  #include <linux/romfs_fs.h>
2404  #include <linux/initrd.h>
2405  #include <linux/sched.h>
2406 +#include <linux/suspend.h>
2407  #include <linux/freezer.h>
2408  
2409  #include "do_mounts.h"
2410 @@ -68,6 +69,11 @@ static void __init handle_initrd(void)
2411  
2412         current->flags &= ~PF_FREEZER_SKIP;
2413  
2414 +       if (!resume_attempted)
2415 +               printk(KERN_ERR "TuxOnIce: No attempt was made to resume from "
2416 +                               "any image that might exist.\n");
2417 +       clear_toi_state(TOI_BOOT_TIME);
2418 +
2419         /* move initrd to rootfs' /old */
2420         sys_fchdir(old_fd);
2421         sys_mount("/", ".", NULL, MS_MOVE, NULL);
2422 diff --git a/init/main.c b/init/main.c
2423 index f6f7042..038a94f 100644
2424 --- a/init/main.c
2425 +++ b/init/main.c
2426 @@ -57,6 +57,7 @@
2427  #include <linux/pid_namespace.h>
2428  #include <linux/device.h>
2429  #include <linux/kthread.h>
2430 +#include <linux/dyn_pageflags.h>
2431  #include <linux/sched.h>
2432  #include <linux/signal.h>
2433  #include <linux/idr.h>
2434 @@ -607,6 +608,7 @@ asmlinkage void __init start_kernel(void)
2435         softirq_init();
2436         timekeeping_init();
2437         time_init();
2438 +       dyn_pageflags_init();
2439         sched_clock_init();
2440         profile_init();
2441         if (!irqs_disabled())
2442 @@ -648,6 +650,7 @@ asmlinkage void __init start_kernel(void)
2443         enable_debug_pagealloc();
2444         cpu_hotplug_init();
2445         kmem_cache_init();
2446 +       dyn_pageflags_use_kzalloc();
2447         debug_objects_mem_init();
2448         idr_init_cache();
2449         setup_per_cpu_pageset();
2450 diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
2451 index dcd165f..7c80422 100644
2452 --- a/kernel/power/Kconfig
2453 +++ b/kernel/power/Kconfig
2454 @@ -34,6 +34,18 @@ config PM_VERBOSE
2455         ---help---
2456         This option enables verbose messages from the Power Management code.
2457  
2458 +config PRINTK_NOSAVE
2459 +       depends on PM && PM_DEBUG
2460 +       bool "Preserve printk data from boot kernel when resuming."
2461 +       default n
2462 +       ---help---
2463 +       This option gives printk data and the associated variables the
2464 +       attribute __nosave, which means that they will not be saved as
2465 +       part of the image. The net effect is that after resuming, your
2466 +       dmesg will show the messages from prior to the atomic restore,
2467 +       instead of the messages from the resumed kernel. This may be
2468 +       useful for debugging hibernation.
2469 +
2470  config CAN_PM_TRACE
2471         def_bool y
2472         depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
2473 @@ -179,6 +191,272 @@ config PM_STD_PARTITION
2474           suspended image to. It will simply pick the first available swap 
2475           device.
2476  
2477 +menuconfig TOI_CORE
2478 +       tristate "Enhanced Hibernation (TuxOnIce)"
2479 +       depends on HIBERNATION
2480 +       default y
2481 +       ---help---
2482 +         TuxOnIce is the 'new and improved' suspend support.
2483 +         
2484 +         See the TuxOnIce home page (tuxonice.net)
2485 +         for FAQs, HOWTOs and other documentation.
2486 +
2487 +       comment "Image Storage (you need at least one allocator)"
2488 +               depends on TOI_CORE
2489 +       
2490 +       config TOI_FILE
2491 +               tristate "File Allocator"
2492 +               depends on TOI_CORE
2493 +               default y
2494 +               ---help---
2495 +                 This option enables support for storing an image in a
2496 +                 simple file. This should be possible, but we're still
2497 +                 testing it.
2498 +
2499 +       config TOI_SWAP
2500 +               tristate "Swap Allocator"
2501 +               depends on TOI_CORE && SWAP
2502 +               default y
2503 +               ---help---
2504 +                 This option enables support for storing an image in your
2505 +                 swap space.
2506 +
2507 +       comment "General Options"
2508 +               depends on TOI_CORE
2509 +
2510 +       config TOI_DEFAULT_PRE_HIBERNATE
2511 +               string "Default pre-hibernate command"
2512 +               depends on TOI_CORE
2513 +               ---help---
2514 +                 This entry allows you to specify a command to be run prior
2515 +                 to starting a hibernation cycle. If this command returns
2516 +                 a non-zero result code, hibernating will be aborted. If
2517 +                 you're starting hibernation via the hibernate script,
2518 +                 this value should probably be blank.
2519 +
2520 +       config TOI_DEFAULT_POST_HIBERNATE
2521 +               string "Default post-resume command"
2522 +               depends on TOI_CORE
2523 +               ---help---
2524 +                 This entry allows you to specify a command to be run after
2525 +                 completing a hibernation cycle. The return code of this
2526 +                 command is ignored. If you're starting hibernation via the
2527 +                 hibernate script, this value should probably be blank.
2528 +
2529 +       config TOI_CRYPTO
2530 +               tristate "Compression support"
2531 +               depends on TOI_CORE && CRYPTO
2532 +               default y
2533 +               ---help---
2534 +                 This option adds support for using cryptoapi compression
2535 +                 algorithms. Compression is particularly useful:
2536 +                 the LZF support that comes with the TuxOnIce patch can double
2537 +                 your suspend and resume speed.
2538 +
2539 +                 You probably want this, so say Y here.
2540 +
2541 +       comment "No compression support available without Cryptoapi support."
2542 +               depends on TOI_CORE && !CRYPTO
2543 +
2544 +       config TOI_USERUI
2545 +               tristate "Userspace User Interface support"
2546 +               depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
2547 +               default y
2548 +               ---help---
2549 +                 This option enables support for a userspace-based user interface
2550 +                 to TuxOnIce, which allows you to have a nice display while suspending
2551 +                 and resuming, and also enables features such as pressing escape to
2552 +                 cancel a cycle or interactive debugging.
2553 +
2554 +       config TOI_USERUI_DEFAULT_PATH
2555 +               string "Default userui program location"
2556 +               default "/usr/local/sbin/tuxonice_fbsplash"
2557 +               depends on TOI_USERUI
2558 +               ---help---
2559 +                 This entry allows you to specify a default path to the userui binary.
2560 +
2561 +       config TOI_KEEP_IMAGE
2562 +               bool "Allow Keep Image Mode"
2563 +               depends on TOI_CORE
2564 +               ---help---
2565 +                 This option allows you to keep an image and reuse it. It is intended
2566 +                 __ONLY__ for use with systems where all filesystems are mounted read-
2567 +                 only (kiosks, for example). To use it, compile this option in and boot
2568 +                 normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
2569 +                 When you resume, the image will not be removed. You will be unable to turn
2570 +                 off swap partitions (assuming you are using the swap allocator), but future
2571 +                 suspends simply do a power-down. The image can be updated using the
2572 +                 kernel command line parameter suspend_act= to turn off the keep image
2573 +                 bit. Keep image mode is a little less user-friendly on purpose - it
2574 +                 should not be used without thought!
2575 +
2576 +       config TOI_REPLACE_SWSUSP
2577 +               bool "Replace swsusp by default"
2578 +               default y
2579 +               depends on TOI_CORE
2580 +               ---help---
2581 +                 TuxOnIce can replace swsusp. This option makes that the default state,
2582 +                 requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
2583 +                 to use the vanilla kernel functionality. Note that your initrd/ramfs will
2584 +                 need to do this before trying to resume, too.
2585 +                 With overriding swsusp enabled, echoing disk to /sys/power/state will
2586 +                 start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
2587 +                 the swap and file allocators are compiled in, the swap allocator will be
2588 +                 used by default.
2589 +
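     A short usage sketch, using only commands already named in the help
     text above: an initramfs that wants vanilla swsusp back runs

	echo 0 > /sys/power/tuxonice/replace_swsusp

     before attempting resume, whereas with replacement left enabled

	echo disk > /sys/power/state

     starts a TuxOnIce cycle rather than a swsusp one.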
2590 +       menuconfig TOI_CLUSTER
2591 +               tristate "Cluster support"
2592 +               default n
2593 +               depends on TOI_CORE && NET && BROKEN
2594 +               ---help---
2595 +                 Support for linking multiple machines in a cluster so that they suspend
2596 +                 and resume together.
2597 +
2598 +       config TOI_DEFAULT_CLUSTER_INTERFACE
2599 +               string "Default cluster interface"
2600 +               depends on TOI_CLUSTER
2601 +               ---help---
2602 +                 The default interface on which to communicate with other nodes in
2603 +                 the cluster.
2604 +                 
2605 +                 If no value is set here, cluster support will be disabled by default.
2606 +
2607 +       config TOI_DEFAULT_CLUSTER_KEY
2608 +               string "Default cluster key"
2609 +               default "Default"
2610 +               depends on TOI_CLUSTER
2611 +               ---help---
2612 +                 The default key used by this node. All nodes in the same cluster
2613 +                 have the same key. Multiple clusters may coexist on the same LAN
2614 +                 by using different values for this key.
2615 +
2616 +       config TOI_CLUSTER_IMAGE_TIMEOUT
2617 +               int "Timeout when checking for image"
2618 +               default 15
2619 +               depends on TOI_CLUSTER
2620 +               ---help---
2621 +                 Timeout (seconds) before continuing to boot when waiting to see
2622 +                 whether other nodes might have an image. Set to -1 to wait
2623 +                 indefinitely. If WAIT_UNTIL_NODES is non-zero, we might continue
2624 +                 booting sooner than this timeout.
2625 +
2626 +       config TOI_CLUSTER_WAIT_UNTIL_NODES
2627 +               int "Nodes without image before continuing"
2628 +               default 0
2629 +               depends on TOI_CLUSTER
2630 +               ---help---
2631 +                 When booting and no image is found, we wait to see if other nodes
2632 +                 have an image before continuing to boot. This value lets us
2633 +                 continue after seeing a certain number of nodes without an image,
2634 +                 instead of continuing to wait for the timeout. Set to 0 to only
2635 +                 use the timeout.
2636 +
2637 +       config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
2638 +               string "Default pre-hibernate script"
2639 +               depends on TOI_CLUSTER
2640 +               ---help---
2641 +                 The default script to be called when starting to hibernate.
2642 +
2643 +       config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
2644 +               string "Default post-hibernate script"
2645 +               depends on TOI_CLUSTER
2646 +               ---help---
2647 +                 The default script to be called after resuming from hibernation.
2648 +
2649 +       config TOI_CHECKSUM
2650 +               bool "Checksum pageset2"
2651 +               default y
2652 +               depends on TOI_CORE
2653 +               select CRYPTO
2654 +               select CRYPTO_ALGAPI
2655 +               select CRYPTO_MD4
2656 +               ---help---
2657 +                 Adds support for checksumming pageset2 pages, to ensure you really get an
2658 +                 atomic copy. Since some filesystems (XFS especially) change metadata even
2659 +                 when there's no other activity, we need this to check for pages that have
2660 +                 been changed while we were saving the page cache. If your debugging output
2661 +                 always says no pages were resaved, you may be able to safely disable this
2662 +                 option.
2663 +
2664 +       config TOI_DEFAULT_WAIT
2665 +               int "Default waiting time for emergency boot messages"
2666 +               default "25"
2667 +               range -1 32768
2668 +               depends on TOI_CORE
2669 +               help
2670 +                 TuxOnIce can display warnings very early in the process of resuming,
2671 +                 if (for example) it appears that you have booted a kernel that doesn't
2672 +                 match an image on disk. It can then give you the opportunity to either
2673 +                 continue booting that kernel, or reboot the machine. This option can be
2674 +                 used to control how long to wait in such circumstances. -1 means wait
2675 +                 forever. 0 means don't wait at all (do the default action, which will
2676 +                 generally be to continue booting and remove the image). Values of 1 or
2677 +                 more indicate the number of seconds to wait before doing the
2678 +                 default.
2679 +
2680 +       config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
2681 +               int "Default extra pages allowance"
2682 +               default "500"
2683 +               range 500 32768
2684 +               depends on TOI_CORE
2685 +               help
2686 +                 This value controls the default for the allowance TuxOnIce makes for
2687 +                 drivers to allocate extra memory during the atomic copy. The default
2688 +                 value of 500 will be okay if you're not using DRI. If you are using
2689 +                 DRI, the easiest way to find what value to use is to try to hibernate
2690 +                 and look at how many pages were actually needed in the sysfs entry
2691 +                 /sys/power/tuxonice/debug_info (first number on the last line), adding
2692 +                 a little extra because the value is not always the same.
2693 +
2694 +       config TOI_PAGEFLAGS_TEST
2695 +               tristate "Test pageflags"
2696 +               default n
2697 +               depends on TOI_CORE
2698 +               help
2699 +                 Build a test of the dynamic pageflags code used by TuxOnIce.
2700 +
2701 +config TOI_PAGEFLAGS_EXPORTS
2702 +       bool
2703 +       depends on TOI_PAGEFLAGS_TEST=m
2704 +       default y
2705 +
2706 +config TOI_USERUI_EXPORTS
2707 +       bool
2708 +       depends on TOI_USERUI=m
2709 +       default y
2710 +
2711 +config TOI_SWAP_EXPORTS
2712 +       bool
2713 +       depends on TOI_SWAP=m
2714 +       default y
2715 +
2716 +config TOI_FILE_EXPORTS
2717 +       bool
2718 +       depends on TOI_FILE=m
2719 +       default y
2720 +
2721 +config TOI_CRYPTO_EXPORTS
2722 +       bool
2723 +       depends on TOI_CRYPTO=m
2724 +       default y
2725 +
2726 +config TOI_CORE_EXPORTS
2727 +       bool
2728 +       depends on TOI_CORE=m
2729 +       default y
2730 +
2731 +config TOI_EXPORTS
2732 +       bool
2733 +       depends on TOI_SWAP_EXPORTS || TOI_FILE_EXPORTS || \
2734 +               TOI_CRYPTO_EXPORTS || TOI_CLUSTER=m || \
2735 +               TOI_USERUI_EXPORTS || TOI_PAGEFLAGS_EXPORTS
2736 +       default y
2737 +
2738 +config TOI
2739 +       bool
2740 +       depends on TOI_CORE!=n
2741 +       default y
2742 +
2743  config APM_EMULATION
2744         tristate "Advanced Power Management Emulation"
2745         depends on PM && SYS_SUPPORTS_APM_EMULATION
2746 diff --git a/kernel/power/Makefile b/kernel/power/Makefile
2747 index 597823b..6d44035 100644
2748 --- a/kernel/power/Makefile
2749 +++ b/kernel/power/Makefile
2750 @@ -4,6 +4,37 @@ EXTRA_CFLAGS   +=      -DDEBUG
2751  endif
2752  
2753  obj-y                          := main.o
2754 +
2755 +tuxonice_core-objs := tuxonice_modules.o tuxonice_sysfs.o tuxonice_highlevel.o \
2756 +               tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
2757 +               tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
2758 +               tuxonice_power_off.o tuxonice_atomic_copy.o
2759 +
2760 +obj-$(CONFIG_TOI)              += tuxonice_builtin.o
2761 +
2762 +ifdef CONFIG_PM_DEBUG
2763 +tuxonice_core-objs             += tuxonice_alloc.o
2764 +endif
2765 +
2766 +ifdef CONFIG_TOI_CHECKSUM
2767 +tuxonice_core-objs             += tuxonice_checksum.o
2768 +endif
2769 +
2770 +ifdef CONFIG_NET
2771 +tuxonice_core-objs             += tuxonice_storage.o tuxonice_netlink.o
2772 +endif
2773 +
2774 +obj-$(CONFIG_TOI_CORE)         += tuxonice_core.o
2775 +obj-$(CONFIG_TOI_CRYPTO)       += tuxonice_compress.o
2776 +
2777 +obj-$(CONFIG_TOI_SWAP)         += tuxonice_block_io.o tuxonice_swap.o
2778 +obj-$(CONFIG_TOI_FILE)         += tuxonice_block_io.o tuxonice_file.o
2779 +obj-$(CONFIG_TOI_CLUSTER)      += tuxonice_cluster.o
2780 +
2781 +obj-$(CONFIG_TOI_USERUI)       += tuxonice_userui.o
2782 +
2783 +obj-$(CONFIG_TOI_PAGEFLAGS_TEST)       += toi_pageflags_test.o
2784 +
2785  obj-$(CONFIG_PM_SLEEP)         += process.o console.o
2786  obj-$(CONFIG_HIBERNATION)      += swsusp.o disk.o snapshot.o swap.o user.o
2787  
2788 diff --git a/kernel/power/disk.c b/kernel/power/disk.c
2789 index bbd85c6..2e444e6 100644
2790 --- a/kernel/power/disk.c
2791 +++ b/kernel/power/disk.c
2792 @@ -25,9 +25,11 @@
2793  
2794  #include "power.h"
2795  
2796 +#include "tuxonice.h"
2797 +#include "tuxonice_builtin.h"
2798  
2799  static int noresume = 0;
2800 -static char resume_file[256] = CONFIG_PM_STD_PARTITION;
2801 +char resume_file[256] = CONFIG_PM_STD_PARTITION;
2802  dev_t swsusp_resume_device;
2803  sector_t swsusp_resume_block;
2804  
2805 @@ -105,7 +107,7 @@ static int hibernation_test(int level) { return 0; }
2806   *     hibernation
2807   */
2808  
2809 -static int platform_begin(int platform_mode)
2810 +int platform_begin(int platform_mode)
2811  {
2812         return (platform_mode && hibernation_ops) ?
2813                 hibernation_ops->begin() : 0;
2814 @@ -116,7 +118,7 @@ static int platform_begin(int platform_mode)
2815   *     working state
2816   */
2817  
2818 -static void platform_end(int platform_mode)
2819 +void platform_end(int platform_mode)
2820  {
2821         if (platform_mode && hibernation_ops)
2822                 hibernation_ops->end();
2823 @@ -127,7 +129,7 @@ static void platform_end(int platform_mode)
2824   *     platform driver if so configured and return an error code if it fails
2825   */
2826  
2827 -static int platform_pre_snapshot(int platform_mode)
2828 +int platform_pre_snapshot(int platform_mode)
2829  {
2830         return (platform_mode && hibernation_ops) ?
2831                 hibernation_ops->pre_snapshot() : 0;
2832 @@ -138,7 +140,7 @@ static int platform_pre_snapshot(int platform_mode)
2833   *     of operation using the platform driver (called with interrupts disabled)
2834   */
2835  
2836 -static void platform_leave(int platform_mode)
2837 +void platform_leave(int platform_mode)
2838  {
2839         if (platform_mode && hibernation_ops)
2840                 hibernation_ops->leave();
2841 @@ -149,7 +151,7 @@ static void platform_leave(int platform_mode)
2842   *     using the platform driver (must be called after platform_prepare())
2843   */
2844  
2845 -static void platform_finish(int platform_mode)
2846 +void platform_finish(int platform_mode)
2847  {
2848         if (platform_mode && hibernation_ops)
2849                 hibernation_ops->finish();
2850 @@ -161,7 +163,7 @@ static void platform_finish(int platform_mode)
2851   *     called, platform_restore_cleanup() must be called.
2852   */
2853  
2854 -static int platform_pre_restore(int platform_mode)
2855 +int platform_pre_restore(int platform_mode)
2856  {
2857         return (platform_mode && hibernation_ops) ?
2858                 hibernation_ops->pre_restore() : 0;
2859 @@ -174,7 +176,7 @@ static int platform_pre_restore(int platform_mode)
2860   *     regardless of the result of platform_pre_restore().
2861   */
2862  
2863 -static void platform_restore_cleanup(int platform_mode)
2864 +void platform_restore_cleanup(int platform_mode)
2865  {
2866         if (platform_mode && hibernation_ops)
2867                 hibernation_ops->restore_cleanup();
2868 @@ -508,6 +510,11 @@ int hibernate(void)
2869  {
2870         int error;
2871  
2872 +#ifdef CONFIG_TOI
2873 +       if (test_action_state(TOI_REPLACE_SWSUSP))
2874 +               return toi_try_hibernate(1);
2875 +#endif
2876 +
2877         mutex_lock(&pm_mutex);
2878         /* The snapshot device should not be opened while we're running */
2879         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
2880 @@ -580,10 +587,21 @@ int hibernate(void)
2881   *
2882   */
2883  
2884 -static int software_resume(void)
2885 +int software_resume(void)
2886  {
2887         int error;
2888         unsigned int flags;
2889 +       resume_attempted = 1;
2890 +
2891 +#ifdef CONFIG_TOI
2892 +       /*
2893 +        * We can't know (until an image header - if any - is loaded) whether
2894 +        * we did override swsusp. We therefore ensure that both are tried.
2895 +        */
2896 +       if (test_action_state(TOI_REPLACE_SWSUSP))
2897 +               printk(KERN_INFO "Replacing swsusp.\n");
2898 +       toi_try_resume();
2899 +#endif
2900  
2901         /*
2902          * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
2903 @@ -596,6 +614,7 @@ static int software_resume(void)
2904          * here to avoid lockdep complaining.
2905          */
2906         mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
2907 +
2908         if (!swsusp_resume_device) {
2909                 if (!strlen(resume_file)) {
2910                         mutex_unlock(&pm_mutex);
2911 @@ -667,9 +686,6 @@ static int software_resume(void)
2912         return error;
2913  }
2914  
2915 -late_initcall(software_resume);
2916 -
2917 -
2918  static const char * const hibernation_modes[] = {
2919         [HIBERNATION_PLATFORM]  = "platform",
2920         [HIBERNATION_SHUTDOWN]  = "shutdown",
2921 @@ -882,6 +898,7 @@ static int __init resume_offset_setup(char *str)
2922  static int __init noresume_setup(char *str)
2923  {
2924         noresume = 1;
2925 +       set_toi_state(TOI_NORESUME_SPECIFIED);
2926         return 1;
2927  }
2928  
2929 diff --git a/kernel/power/power.h b/kernel/power/power.h
2930 index acc0c10..6a4a5b4 100644
2931 --- a/kernel/power/power.h
2932 +++ b/kernel/power/power.h
2933 @@ -1,7 +1,16 @@
2934 +/*
2935 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
2936 + */
2937 +
2938 +#ifndef KERNEL_POWER_POWER_H
2939 +#define KERNEL_POWER_POWER_H
2940 +
2941  #include <linux/suspend.h>
2942  #include <linux/suspend_ioctls.h>
2943  #include <linux/utsname.h>
2944  #include <linux/freezer.h>
2945 +#include "tuxonice.h"
2946 +#include "tuxonice_builtin.h"
2947  
2948  struct swsusp_info {
2949         struct new_utsname      uts;
2950 @@ -21,18 +30,22 @@ struct swsusp_info {
2951  extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
2952  extern int arch_hibernation_header_restore(void *addr);
2953  
2954 -static inline int init_header_complete(struct swsusp_info *info)
2955 +static inline int init_swsusp_header_complete(struct swsusp_info *info)
2956  {
2957         return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
2958  }
2959  
2960 -static inline char *check_image_kernel(struct swsusp_info *info)
2961 +static inline char *check_swsusp_image_kernel(struct swsusp_info *info)
2962  {
2963         return arch_hibernation_header_restore(info) ?
2964                         "architecture specific data" : NULL;
2965  }
2966 +#else
2967 +extern char *check_swsusp_image_kernel(struct swsusp_info *info);
2968  #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
2969 +extern int init_swsusp_header(struct swsusp_info *info);
2970  
2971 +extern char resume_file[256];
2972  /*
2973   * Keep some memory free so that I/O operations can succeed without paging
2974   * [Might this be more than 4 MB?]
2975 @@ -63,6 +76,8 @@ static struct kobj_attribute _name##_attr = { \
2976         .store  = _name##_store,                \
2977  }
2978  
2979 +extern struct pbe *restore_pblist;
2980 +
2981  /* Preferred image size in bytes (default 500 MB) */
2982  extern unsigned long image_size;
2983  extern int in_suspend;
2984 @@ -223,3 +238,26 @@ static inline void suspend_thaw_processes(void)
2985  {
2986  }
2987  #endif
2988 +
2989 +extern struct page *saveable_page(unsigned long pfn);
2990 +#ifdef CONFIG_HIGHMEM
2991 +extern struct page *saveable_highmem_page(unsigned long pfn);
2992 +#else
2993 +static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
2994 +#endif
2995 +
2996 +#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
2997 +extern struct list_head nosave_regions;
2998 +
2999 +/**
3000 + *     This structure represents a range of page frames the contents of which
3001 + *     should not be saved during the suspend.
3002 + */
3003 +
3004 +struct nosave_region {
3005 +       struct list_head list;
3006 +       unsigned long start_pfn;
3007 +       unsigned long end_pfn;
3008 +};
3009 +
3010 +#endif
3011 diff --git a/kernel/power/process.c b/kernel/power/process.c
3012 index 278946a..ba2eb75 100644
3013 --- a/kernel/power/process.c
3014 +++ b/kernel/power/process.c
3015 @@ -13,6 +13,10 @@
3016  #include <linux/module.h>
3017  #include <linux/syscalls.h>
3018  #include <linux/freezer.h>
3019 +#include <linux/buffer_head.h>
3020 +
3021 +int freezer_state;
3022 +EXPORT_SYMBOL(freezer_state);
3023  
3024  /* 
3025   * Timeout for stopping processes
3026 @@ -201,7 +205,8 @@ static int try_to_freeze_tasks(bool sig_only)
3027                 do_each_thread(g, p) {
3028                         task_lock(p);
3029                         if (freezing(p) && !freezer_should_skip(p))
3030 -                               printk(KERN_ERR " %s\n", p->comm);
3031 +                               printk(KERN_ERR " %s (%d) failed to freeze.\n",
3032 +                                               p->comm, p->pid);
3033                         cancel_freezing(p);
3034                         task_unlock(p);
3035                 } while_each_thread(g, p);
3036 @@ -221,17 +226,25 @@ int freeze_processes(void)
3037  {
3038         int error;
3039  
3040 -       printk("Freezing user space processes ... ");
3041 +       printk(KERN_INFO "Stopping fuse filesystems.\n");
3042 +       freeze_filesystems(FS_FREEZER_FUSE);
3043 +       freezer_state = FREEZER_FILESYSTEMS_FROZEN;
3044 +       printk(KERN_INFO "Freezing user space processes ... ");
3045         error = try_to_freeze_tasks(true);
3046         if (error)
3047                 goto Exit;
3048 -       printk("done.\n");
3049 +       printk(KERN_INFO "done.\n");
3050  
3051 -       printk("Freezing remaining freezable tasks ... ");
3052 +       sys_sync();
3053 +       printk(KERN_INFO "Stopping normal filesystems.\n");
3054 +       freeze_filesystems(FS_FREEZER_NORMAL);
3055 +       freezer_state = FREEZER_USERSPACE_FROZEN;
3056 +       printk(KERN_INFO "Freezing remaining freezable tasks ... ");
3057         error = try_to_freeze_tasks(false);
3058         if (error)
3059                 goto Exit;
3060         printk("done.");
3061 +       freezer_state = FREEZER_FULLY_ON;
3062   Exit:
3063         BUG_ON(in_atomic());
3064         printk("\n");
3065 @@ -257,11 +270,35 @@ static void thaw_tasks(bool nosig_only)
3066  
3067  void thaw_processes(void)
3068  {
3069 -       printk("Restarting tasks ... ");
3070 -       thaw_tasks(true);
3071 +       int old_state = freezer_state;
3072 +
3073 +       if (old_state == FREEZER_OFF)
3074 +               return;
3075 +
3076 +       /*
3077 +        * Change state beforehand because thawed tasks might submit I/O
3078 +        * immediately.
3079 +        */
3080 +       freezer_state = FREEZER_OFF;
3081 +
3082 +       printk(KERN_INFO "Restarting all filesystems ...\n");
3083 +       thaw_filesystems(FS_FREEZER_ALL);
3084 +
3085 +       printk(KERN_INFO "Restarting tasks ... ");
3086 +
3087 +       if (old_state == FREEZER_FULLY_ON)
3088 +               thaw_tasks(true);
3089         thaw_tasks(false);
3090         schedule();
3091         printk("done.\n");
3092  }
3093  
3094  EXPORT_SYMBOL(refrigerator);
3095 +
3096 +void thaw_kernel_threads(void)
3097 +{
3098 +       freezer_state = FREEZER_USERSPACE_FROZEN;
3099 +       printk(KERN_INFO "Restarting normal filesystems.\n");
3100 +       thaw_filesystems(FS_FREEZER_NORMAL);
3101 +       thaw_tasks(true);
3102 +}
3103 diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
3104 index 5d2ab83..bdb3071 100644
3105 --- a/kernel/power/snapshot.c
3106 +++ b/kernel/power/snapshot.c
3107 @@ -33,6 +33,7 @@
3108  #include <asm/io.h>
3109  
3110  #include "power.h"
3111 +#include "tuxonice_builtin.h"
3112  
3113  static int swsusp_page_is_free(struct page *);
3114  static void swsusp_set_page_forbidden(struct page *);
3115 @@ -44,6 +45,12 @@ static void swsusp_unset_page_forbidden(struct page *);
3116   * directly to their "original" page frames.
3117   */
3118  struct pbe *restore_pblist;
3119 +int resume_attempted;
3120 +EXPORT_SYMBOL_GPL(resume_attempted);
3121 +
3122 +#ifdef CONFIG_TOI
3123 +#include "tuxonice_pagedir.h"
3124 +#endif
3125  
3126  /* Pointer to an auxiliary buffer (1 page) */
3127  static void *buffer;
3128 @@ -86,6 +93,11 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
3129  
3130  unsigned long get_safe_page(gfp_t gfp_mask)
3131  {
3132 +#ifdef CONFIG_TOI
3133 +       if (toi_running)
3134 +               return toi_get_nonconflicting_page();
3135 +#endif
3136 +
3137         return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
3138  }
3139  
3140 @@ -561,18 +573,8 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
3141         return bb->start_pfn + bit;
3142  }
3143  
3144 -/**
3145 - *     This structure represents a range of page frames the contents of which
3146 - *     should not be saved during the suspend.
3147 - */
3148 -
3149 -struct nosave_region {
3150 -       struct list_head list;
3151 -       unsigned long start_pfn;
3152 -       unsigned long end_pfn;
3153 -};
3154 -
3155 -static LIST_HEAD(nosave_regions);
3156 +LIST_HEAD(nosave_regions);
3157 +EXPORT_SYMBOL_GPL(nosave_regions);
3158  
3159  /**
3160   *     register_nosave_region - register a range of page frames the contents
3161 @@ -809,7 +811,7 @@ static unsigned int count_free_highmem_pages(void)
3162   *     and it isn't a part of a free chunk of pages.
3163   */
3164  
3165 -static struct page *saveable_highmem_page(unsigned long pfn)
3166 +struct page *saveable_highmem_page(unsigned long pfn)
3167  {
3168         struct page *page;
3169  
3170 @@ -851,8 +853,6 @@ unsigned int count_highmem_pages(void)
3171         }
3172         return n;
3173  }
3174 -#else
3175 -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
3176  #endif /* CONFIG_HIGHMEM */
3177  
3178  /**
3179 @@ -864,7 +864,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
3180   *     a free chunk of pages.
3181   */
3182  
3183 -static struct page *saveable_page(unsigned long pfn)
3184 +struct page *saveable_page(unsigned long pfn)
3185  {
3186         struct page *page;
3187  
3188 @@ -1198,6 +1198,11 @@ asmlinkage int swsusp_save(void)
3189  {
3190         unsigned int nr_pages, nr_highmem;
3191  
3192 +#ifdef CONFIG_TOI
3193 +       if (toi_running)
3194 +               return toi_post_context_save();
3195 +#endif
3196 +
3197         printk(KERN_INFO "PM: Creating hibernation image: \n");
3198  
3199         drain_local_pages(NULL);
3200 @@ -1238,14 +1243,14 @@ asmlinkage int swsusp_save(void)
3201  }
3202  
3203  #ifndef CONFIG_ARCH_HIBERNATION_HEADER
3204 -static int init_header_complete(struct swsusp_info *info)
3205 +int init_swsusp_header_complete(struct swsusp_info *info)
3206  {
3207         memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
3208         info->version_code = LINUX_VERSION_CODE;
3209         return 0;
3210  }
3211  
3212 -static char *check_image_kernel(struct swsusp_info *info)
3213 +char *check_swsusp_image_kernel(struct swsusp_info *info)
3214  {
3215         if (info->version_code != LINUX_VERSION_CODE)
3216                 return "kernel version";
3217 @@ -1259,6 +1264,7 @@ static char *check_image_kernel(struct swsusp_info *info)
3218                 return "machine";
3219         return NULL;
3220  }
3221 +EXPORT_SYMBOL_GPL(check_swsusp_image_kernel);
3222  #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
3223  
3224  unsigned long snapshot_get_image_size(void)
3225 @@ -1266,7 +1272,7 @@ unsigned long snapshot_get_image_size(void)
3226         return nr_copy_pages + nr_meta_pages + 1;
3227  }
3228  
3229 -static int init_header(struct swsusp_info *info)
3230 +int init_swsusp_header(struct swsusp_info *info)
3231  {
3232         memset(info, 0, sizeof(struct swsusp_info));
3233         info->num_physpages = num_physpages;
3234 @@ -1274,7 +1280,7 @@ static int init_header(struct swsusp_info *info)
3235         info->pages = snapshot_get_image_size();
3236         info->size = info->pages;
3237         info->size <<= PAGE_SHIFT;
3238 -       return init_header_complete(info);
3239 +       return init_swsusp_header_complete(info);
3240  }
3241  
3242  /**
3243 @@ -1330,7 +1336,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
3244         if (!handle->offset) {
3245                 int error;
3246  
3247 -               error = init_header((struct swsusp_info *)buffer);
3248 +               error = init_swsusp_header((struct swsusp_info *)buffer);
3249                 if (error)
3250                         return error;
3251                 handle->buffer = buffer;
3252 @@ -1427,7 +1433,7 @@ static int check_header(struct swsusp_info *info)
3253  {
3254         char *reason;
3255  
3256 -       reason = check_image_kernel(info);
3257 +       reason = check_swsusp_image_kernel(info);
3258         if (!reason && info->num_physpages != num_physpages)
3259                 reason = "memory size";
3260         if (reason) {
3261 diff --git a/kernel/power/toi_pageflags_test.c b/kernel/power/toi_pageflags_test.c
3262 new file mode 100644
3263 index 0000000..381f05b
3264 --- /dev/null
3265 +++ b/kernel/power/toi_pageflags_test.c
3266 @@ -0,0 +1,80 @@
3267 +/*
3268 + * TuxOnIce pageflags tester.
3269 + */
3270 +
3271 +#include "linux/module.h"
3272 +#include "linux/bootmem.h"
3273 +#include "linux/sched.h"
3274 +#include "linux/dyn_pageflags.h"
3275 +
3276 +DECLARE_DYN_PAGEFLAGS(test_map);
3277 +
3278 +static char *bits_on(void)
3279 +{
3280 +       char *page = (char *) get_zeroed_page(GFP_KERNEL);
3281 +       unsigned long index = get_next_bit_on(&test_map, max_pfn + 1);
3282 +       int pos = 0;
3283 +
3284 +       while (index <= max_pfn) {
3285 +               pos += snprintf_used(page + pos, PAGE_SIZE - pos - 1, "%lu ",
3286 +                               index);
3287 +               index = get_next_bit_on(&test_map, index);
3288 +       }
3289 +
3290 +       return page;
3291 +}
3292 +
3293 +static __init int do_check(void)
3294 +{
3295 +       unsigned long index;
3296 +       int step = 1, steps = 100;
3297 +
3298 +       allocate_dyn_pageflags(&test_map, 0);
3299 +
3300 +       for (index = 1; index < max_pfn; index++) {
3301 +               char *result;
3302 +               char compare[100];
3303 +
3304 +               if (index > (max_pfn / steps * step)) {
3305 +                       printk(KERN_INFO "%d/%d\r", step, steps);
3306 +                       step++;
3307 +               }
3308 +
3309 +
3310 +               if (!pfn_valid(index))
3311 +                       continue;
3312 +
3313 +               clear_dyn_pageflags(&test_map);
3314 +               set_dynpageflag(&test_map, pfn_to_page(0));
3315 +               set_dynpageflag(&test_map, pfn_to_page(index));
3316 +
3317 +               sprintf(compare, "0 %lu ", index);
3318 +
3319 +               result = bits_on();
3320 +
3321 +               if (strcmp(result, compare)) {
3322 +                       printk(KERN_INFO "Expected \"%s\", got \"%s\"\n",
3323 +                                       compare, result);
3324 +               }
3325 +
3326 +               free_page((unsigned long) result);
3327 +               schedule();
3328 +       }
3329 +
3330 +       free_dyn_pageflags(&test_map);
3331 +       return 0;
3332 +}
3333 +
3334 +#ifdef MODULE
3335 +static __exit void check_unload(void)
3336 +{
3337 +}
3338 +
3339 +module_init(do_check);
3340 +module_exit(check_unload);
3341 +MODULE_AUTHOR("Nigel Cunningham");
3342 +MODULE_DESCRIPTION("Pageflags testing");
3343 +MODULE_LICENSE("GPL");
3344 +#else
3345 +late_initcall(do_check);
3346 +#endif
3347 diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
3348 new file mode 100644
3349 index 0000000..912e89a
3350 --- /dev/null
3351 +++ b/kernel/power/tuxonice.h
3352 @@ -0,0 +1,210 @@
3353 +/*
3354 + * kernel/power/tuxonice.h
3355 + *
3356 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
3357 + *
3358 + * This file is released under the GPLv2.
3359 + *
3360 + * It contains declarations used throughout swsusp.
3361 + *
3362 + */
3363 +
3364 +#ifndef KERNEL_POWER_TOI_H
3365 +#define KERNEL_POWER_TOI_H
3366 +
3367 +#include <linux/delay.h>
3368 +#include <linux/bootmem.h>
3369 +#include <linux/suspend.h>
3370 +#include <linux/dyn_pageflags.h>
3371 +#include <linux/fs.h>
3372 +#include <linux/kmod.h>
3373 +#include <asm/setup.h>
3374 +#include "tuxonice_pageflags.h"
3375 +
3376 +#define TOI_CORE_VERSION "3.0-rc7"
3377 +
3378 +#define MY_BOOT_KERNEL_DATA_VERSION 1
3379 +
3380 +struct toi_boot_kernel_data {
3381 +       int version;
3382 +       int size;
3383 +       unsigned long toi_action;
3384 +       unsigned long toi_debug_state;
3385 +       int toi_default_console_level;
3386 +       int toi_io_time[2][2];
3387 +       char toi_nosave_commandline[COMMAND_LINE_SIZE];
3388 +};
3389 +
3390 +extern struct toi_boot_kernel_data toi_bkd;
3391 +
3392 +/* Location of boot kernel data struct in kernel being resumed */
3393 +extern unsigned long boot_kernel_data_buffer;
3394 +
3395 +/*              == Action states ==            */
3396 +
3397 +enum {
3398 +       TOI_REBOOT,
3399 +       TOI_PAUSE,
3400 +       TOI_LOGALL,
3401 +       TOI_CAN_CANCEL,
3402 +       TOI_KEEP_IMAGE,
3403 +       TOI_FREEZER_TEST,
3404 +       TOI_SINGLESTEP,
3405 +       TOI_PAUSE_NEAR_PAGESET_END,
3406 +       TOI_TEST_FILTER_SPEED,
3407 +       TOI_TEST_BIO,
3408 +       TOI_NO_PAGESET2,
3409 +       TOI_PM_PREPARE_CONSOLE,
3410 +       TOI_IGNORE_ROOTFS,
3411 +       TOI_REPLACE_SWSUSP,
3412 +       TOI_PAGESET2_FULL,
3413 +       TOI_ABORT_ON_RESAVE_NEEDED,
3414 +       TOI_NO_MULTITHREADED_IO,
3415 +       TOI_NO_DIRECT_LOAD,
3416 +       TOI_LATE_CPU_HOTPLUG,
3417 +       TOI_GET_MAX_MEM_ALLOCD,
3418 +       TOI_NO_FLUSHER_THREAD,
3419 +};
3420 +
3421 +#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
3422 +#define test_action_state(bit) (test_bit(bit, &toi_bkd.toi_action))
3423 +
3424 +/*              == Result states ==            */
3425 +
3426 +enum {
3427 +       TOI_ABORTED,
3428 +       TOI_ABORT_REQUESTED,
3429 +       TOI_NOSTORAGE_AVAILABLE,
3430 +       TOI_INSUFFICIENT_STORAGE,
3431 +       TOI_FREEZING_FAILED,
3432 +       TOI_KEPT_IMAGE,
3433 +       TOI_WOULD_EAT_MEMORY,
3434 +       TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
3435 +       TOI_PM_SEM,
3436 +       TOI_DEVICE_REFUSED,
3437 +       TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
3438 +       TOI_UNABLE_TO_PREPARE_IMAGE,
3439 +       TOI_FAILED_MODULE_INIT,
3440 +       TOI_FAILED_MODULE_CLEANUP,
3441 +       TOI_FAILED_IO,
3442 +       TOI_OUT_OF_MEMORY,
3443 +       TOI_IMAGE_ERROR,
3444 +       TOI_PLATFORM_PREP_FAILED,
3445 +       TOI_CPU_HOTPLUG_FAILED,
3446 +       TOI_ARCH_PREPARE_FAILED,
3447 +       TOI_RESAVE_NEEDED,
3448 +       TOI_CANT_SUSPEND,
3449 +       TOI_NOTIFIERS_PREPARE_FAILED,
3450 +       TOI_PRE_SNAPSHOT_FAILED,
3451 +       TOI_PRE_RESTORE_FAILED,
3452 +};
3453 +
3454 +extern unsigned long toi_result;
3455 +
3456 +#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
3457 +#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
3458 +                               test_and_set_bit(bit, &toi_result))
3459 +#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
3460 +#define test_result_state(bit) (test_bit(bit, &toi_result))
3461 +
3462 +/*      == Debug sections and levels ==        */
3463 +
3464 +/* debugging levels. */
3465 +enum {
3466 +       TOI_STATUS = 0,
3467 +       TOI_ERROR = 2,
3468 +       TOI_LOW,
3469 +       TOI_MEDIUM,
3470 +       TOI_HIGH,
3471 +       TOI_VERBOSE,
3472 +};
3473 +
3474 +enum {
3475 +       TOI_ANY_SECTION,
3476 +       TOI_EAT_MEMORY,
3477 +       TOI_IO,
3478 +       TOI_HEADER,
3479 +       TOI_WRITER,
3480 +       TOI_MEMORY,
3481 +};
3482 +
3483 +#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
3484 +#define clear_debug_state(bit) \
3485 +       (test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
3486 +#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
3487 +
3488 +/*             == Steps in hibernating ==      */
3489 +
3490 +enum {
3491 +       STEP_HIBERNATE_PREPARE_IMAGE,
3492 +       STEP_HIBERNATE_SAVE_IMAGE,
3493 +       STEP_HIBERNATE_POWERDOWN,
3494 +       STEP_RESUME_CAN_RESUME,
3495 +       STEP_RESUME_LOAD_PS1,
3496 +       STEP_RESUME_DO_RESTORE,
3497 +       STEP_RESUME_READ_PS2,
3498 +       STEP_RESUME_GO,
3499 +       STEP_RESUME_ALT_IMAGE,
3500 +       STEP_CLEANUP,
3501 +       STEP_QUIET_CLEANUP
3502 +};
3503 +
3504 +/*             == TuxOnIce states ==
3505 +       (see also include/linux/suspend.h)      */
3506 +
3507 +#define get_toi_state()  (toi_state)
3508 +#define restore_toi_state(saved_state) \
3509 +       do { toi_state = saved_state; } while (0)
3510 +
3511 +/*             == Module support ==            */
3512 +
3513 +struct toi_core_fns {
3514 +       int (*post_context_save)(void);
3515 +       unsigned long (*get_nonconflicting_page)(void);
3516 +       int (*try_hibernate)(int have_pmsem);
3517 +       void (*try_resume)(void);
3518 +};
3519 +
3520 +extern struct toi_core_fns *toi_core_fns;
3521 +
3522 +/*             == All else ==                  */
3523 +#define KB(x) ((x) << (PAGE_SHIFT - 10))
3524 +#define MB(x) ((x) >> (20 - PAGE_SHIFT))
3525 +
3526 +extern int toi_start_anything(int toi_or_resume);
3527 +extern void toi_finish_anything(int toi_or_resume);
3528 +
3529 +extern int save_image_part1(void);
3530 +extern int toi_atomic_restore(void);
3531 +
3532 +extern int _toi_try_hibernate(int have_pmsem);
3533 +extern void __toi_try_resume(void);
3534 +
3535 +extern int __toi_post_context_save(void);
3536 +
3537 +extern unsigned int nr_hibernates;
3538 +extern char alt_resume_param[256];
3539 +
3540 +extern void copyback_post(void);
3541 +extern int toi_hibernate(void);
3542 +extern long extra_pd1_pages_used;
3543 +
3544 +#define SECTOR_SIZE 512
3545 +
3546 +extern void toi_early_boot_message(int can_erase_image, int default_answer,
3547 +       char *warning_reason, ...);
3548 +
3549 +static inline int load_direct(struct page *page)
3550 +{
3551 +       return test_action_state(TOI_NO_DIRECT_LOAD) ? 0 :
3552 +               PagePageset1Copy(page);
3553 +}
3554 +
3555 +extern int pre_resume_freeze(void);
3556 +extern int do_check_can_resume(void);
3557 +extern int do_toi_step(int step);
3558 +extern int toi_launch_userspace_program(char *command, int channel_no,
3559 +               enum umh_wait wait);
3560 +
3561 +extern char *tuxonice_signature;
3562 +#endif
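     The action and result bitmaps are only ever touched through the macros
     above; a hypothetical caller (sketch only, not code from the patch):

	/* Record an allocation failure, marking the cycle aborted. */
	static int example_grab_page(unsigned long *virt)
	{
		*virt = get_zeroed_page(GFP_KERNEL);
		if (!*virt) {
			set_abort_result(TOI_OUT_OF_MEMORY);
			return -ENOMEM;
		}
		return 0;
	}

	/* Cleanup code can then ask what happened: */
	if (test_result_state(TOI_ABORTED))
		printk(KERN_INFO "TuxOnIce: cycle aborted.\n");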
3563 diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
3564 new file mode 100644
3565 index 0000000..101f65b
3566 --- /dev/null
3567 +++ b/kernel/power/tuxonice_alloc.c
3568 @@ -0,0 +1,293 @@
3569 +/*
3570 + * kernel/power/tuxonice_alloc.c
3571 + *
3572 + * Copyright (C) 2007 Nigel Cunningham (nigel at tuxonice net)
3573 + *
3574 + * This file is released under the GPLv2.
3575 + *
3576 + */
3577 +
3578 +#ifdef CONFIG_PM_DEBUG
3579 +#include <linux/slab.h>
3580 +#include <linux/module.h>
3581 +#include "tuxonice_modules.h"
3582 +#include "tuxonice_sysfs.h"
3583 +
3584 +#define TOI_ALLOC_PATHS 39
3585 +
3586 +DEFINE_MUTEX(toi_alloc_mutex);
3587 +
3588 +static struct toi_module_ops toi_alloc_ops;
3589 +
3590 +static int toi_fail_num;
3591 +static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
3592 +               toi_free_count[TOI_ALLOC_PATHS],
3593 +               toi_test_count[TOI_ALLOC_PATHS],
3594 +               toi_fail_count[TOI_ALLOC_PATHS];
3595 +int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
3596 +int cur_allocd, max_allocd;
3597 +
3598 +static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
3599 +       "", /* 0 */
3600 +       "get_io_info_struct",
3601 +       "extent",
3602 +       "extent (loading chain)",
3603 +       "userui channel",
3604 +       "userui arg", /* 5 */
3605 +       "attention list metadata",
3606 +       "extra pagedir memory metadata",
3607 +       "bdev metadata",
3608 +       "extra pagedir memory",
3609 +       "header_locations_read", /* 10 */
3610 +       "bio queue",
3611 +       "prepare_readahead",
3612 +       "i/o buffer",
3613 +       "writer buffer in bio_init",
3614 +       "checksum buffer", /* 15 */
3615 +       "compression buffer",
3616 +       "filewriter signature op",
3617 +       "set resume param alloc1",
3618 +       "set resume param alloc2",
3619 +       "debugging info buffer", /* 20 */
3620 +       "check can resume buffer",
3621 +       "write module config buffer",
3622 +       "read module config buffer",
3623 +       "write image header buffer",
3624 +       "read pageset1 buffer", /* 25 */
3625 +       "get_have_image_data buffer",
3626 +       "checksum page",
3627 +       "worker rw loop",
3628 +       "get nonconflicting page",
3629 +       "ps1 load addresses", /* 30 */
3630 +       "remove swap image",
3631 +       "swap image exists",
3632 +       "swap parse sig location",
3633 +       "sysfs kobj",
3634 +       "swap mark resume attempted buffer", /* 35 */
3635 +       "cluster member",
3636 +       "boot kernel data buffer",
3637 +       "setting swap signature"
3638 +};
3639 +
3640 +#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
3641 +       do { \
3642 +               BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
3643 +               \
3644 +               if (FAIL_NUM == toi_fail_num) { \
3645 +                       atomic_inc(&toi_test_count[FAIL_NUM]); \
3646 +                       toi_fail_num = 0; \
3647 +                       return FAIL_VAL; \
3648 +               } \
3649 +       } while (0)
3650 +
3651 +static void alloc_update_stats(int fail_num, void *result)
3652 +{
3653 +       if (!result) {
3654 +               atomic_inc(&toi_fail_count[fail_num]);
3655 +               return;
3656 +       }
3657 +
3658 +       atomic_inc(&toi_alloc_count[fail_num]);
3659 +       if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
3660 +               mutex_lock(&toi_alloc_mutex);
3661 +               toi_cur_allocd[fail_num]++;
3662 +               cur_allocd++;
3663 +               if (unlikely(cur_allocd > max_allocd)) {
3664 +                       int i;
3665 +
3666 +                       for (i = 0; i < TOI_ALLOC_PATHS; i++)
3667 +                               toi_max_allocd[i] = toi_cur_allocd[i];
3668 +                       max_allocd = cur_allocd;
3669 +               }
3670 +               mutex_unlock(&toi_alloc_mutex);
3671 +       }
3672 +}
3673 +
3674 +static void free_update_stats(int fail_num)
3675 +{
3676 +       BUG_ON(fail_num >= TOI_ALLOC_PATHS);
3677 +       atomic_inc(&toi_free_count[fail_num]);
3678 +       if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
3679 +               mutex_lock(&toi_alloc_mutex);
3680 +               cur_allocd--;
3681 +               toi_cur_allocd[fail_num]--;
3682 +               mutex_unlock(&toi_alloc_mutex);
3683 +       }
3684 +}
3685 +
3686 +void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
3687 +{
3688 +       void *result;
3689 +
3690 +       if (toi_alloc_ops.enabled)
3691 +               MIGHT_FAIL(fail_num, NULL);
3692 +       result = kzalloc(size, flags);
3693 +       if (toi_alloc_ops.enabled)
3694 +               alloc_update_stats(fail_num, result);
3695 +       return result;
3696 +}
3697 +
3698 +unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
3699 +               unsigned int order)
3700 +{
3701 +       unsigned long result;
3702 +
3703 +       if (toi_alloc_ops.enabled)
3704 +               MIGHT_FAIL(fail_num, 0);
3705 +       result = __get_free_pages(mask, order);
3706 +       if (toi_alloc_ops.enabled)
3707 +               alloc_update_stats(fail_num, (void *) result);
3708 +       return result;
3709 +}
3710 +
3711 +struct page *toi_alloc_page(int fail_num, gfp_t mask)
3712 +{
3713 +       struct page *result;
3714 +
3715 +       if (toi_alloc_ops.enabled)
3716 +               MIGHT_FAIL(fail_num, 0);
3717 +       result = alloc_page(mask);
3718 +       if (toi_alloc_ops.enabled)
3719 +               alloc_update_stats(fail_num, (void *) result);
3720 +       return result;
3721 +}
3722 +
3723 +unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
3724 +{
3725 +       unsigned long result;
3726 +
3727 +       if (toi_alloc_ops.enabled)
3728 +               MIGHT_FAIL(fail_num, 0);
3729 +       result = get_zeroed_page(mask);
3730 +       if (toi_alloc_ops.enabled)
3731 +               alloc_update_stats(fail_num, (void *) result);
3732 +       return result;
3733 +}
3734 +
3735 +void toi_kfree(int fail_num, const void *arg)
3736 +{
3737 +       if (arg && toi_alloc_ops.enabled)
3738 +               free_update_stats(fail_num);
3739 +
3740 +       kfree(arg);
3741 +}
3742 +
3743 +void toi_free_page(int fail_num, unsigned long virt)
3744 +{
3745 +       if (virt && toi_alloc_ops.enabled)
3746 +               free_update_stats(fail_num);
3747 +
3748 +       free_page(virt);
3749 +}
3750 +
3751 +void toi__free_page(int fail_num, struct page *page)
3752 +{
3753 +       if (page && toi_alloc_ops.enabled)
3754 +               free_update_stats(fail_num);
3755 +
3756 +       __free_page(page);
3757 +}
3758 +
3759 +void toi_free_pages(int fail_num, struct page *page, int order)
3760 +{
3761 +       if (page && toi_alloc_ops.enabled)
3762 +               free_update_stats(fail_num);
3763 +
3764 +       __free_pages(page, order);
3765 +}
3766 +
3767 +void toi_alloc_print_debug_stats(void)
3768 +{
3769 +       int i, header_done = 0;
3770 +
3771 +       if (!toi_alloc_ops.enabled)
3772 +               return;
3773 +
3774 +       for (i = 0; i < TOI_ALLOC_PATHS; i++)
3775 +               if (atomic_read(&toi_alloc_count[i]) !=
3776 +                   atomic_read(&toi_free_count[i])) {
3777 +                       if (!header_done) {
3778 +                               printk(KERN_INFO "Idx  Allocs   Frees   Tests "
3779 +                                       "  Fails Max     Description\n");
3780 +                               header_done = 1;
3781 +                       }
3782 +
3783 +                       printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
3784 +                               atomic_read(&toi_alloc_count[i]),
3785 +                               atomic_read(&toi_free_count[i]),
3786 +                               atomic_read(&toi_test_count[i]),
3787 +                               atomic_read(&toi_fail_count[i]),
3788 +                               toi_max_allocd[i],
3789 +                               toi_alloc_desc[i]);
3790 +               }
3791 +}
3792 +EXPORT_SYMBOL_GPL(toi_alloc_print_debug_stats);
3793 +
3794 +static int toi_alloc_initialise(int starting_cycle)
3795 +{
3796 +       int i;
3797 +
3798 +       if (starting_cycle && toi_alloc_ops.enabled) {
3799 +               for (i = 0; i < TOI_ALLOC_PATHS; i++) {
3800 +                       atomic_set(&toi_alloc_count[i], 0);
3801 +                       atomic_set(&toi_free_count[i], 0);
3802 +                       atomic_set(&toi_test_count[i], 0);
3803 +                       atomic_set(&toi_fail_count[i], 0);
3804 +                       toi_cur_allocd[i] = 0;
3805 +                       toi_max_allocd[i] = 0;
3806 +               }
3807 +               max_allocd = 0;
3808 +               cur_allocd = 0;
3809 +       }
3810 +
3811 +       return 0;
3812 +}
3813 +
3814 +static struct toi_sysfs_data sysfs_params[] = {
3815 +       { TOI_ATTR("failure_test", SYSFS_RW),
3816 +         SYSFS_INT(&toi_fail_num, 0, 99, 0)
3817 +       },
3818 +
3819 +       { TOI_ATTR("find_max_mem_allocated", SYSFS_RW),
3820 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_GET_MAX_MEM_ALLOCD, 0)
3821 +       },
3822 +
3823 +       { TOI_ATTR("enabled", SYSFS_RW),
3824 +         SYSFS_INT(&toi_alloc_ops.enabled, 0, 1, 0)
3825 +       }
3826 +};
3827 +
3828 +static struct toi_module_ops toi_alloc_ops = {
3829 +       .type                                   = MISC_HIDDEN_MODULE,
3830 +       .name                                   = "allocation debugging",
3831 +       .directory                              = "alloc",
3832 +       .module                                 = THIS_MODULE,
3833 +       .early                                  = 1,
3834 +       .initialise                             = toi_alloc_initialise,
3835 +
3836 +       .sysfs_data             = sysfs_params,
3837 +       .num_sysfs_entries      = sizeof(sysfs_params) /
3838 +               sizeof(struct toi_sysfs_data),
3839 +};
3840 +
3841 +int toi_alloc_init(void)
3842 +{
3843 +       int result = toi_register_module(&toi_alloc_ops);
3844 +       toi_alloc_ops.enabled = 0;
3845 +       return result;
3846 +}
3847 +
3848 +void toi_alloc_exit(void)
3849 +{
3850 +       toi_unregister_module(&toi_alloc_ops);
3851 +}
3852 +#ifdef CONFIG_TOI_EXPORTS
3853 +EXPORT_SYMBOL_GPL(toi_kzalloc);
3854 +EXPORT_SYMBOL_GPL(toi_get_free_pages);
3855 +EXPORT_SYMBOL_GPL(toi_get_zeroed_page);
3856 +EXPORT_SYMBOL_GPL(toi_kfree);
3857 +EXPORT_SYMBOL_GPL(toi_free_page);
3858 +EXPORT_SYMBOL_GPL(toi__free_page);
3859 +EXPORT_SYMBOL_GPL(toi_alloc_page);
3860 +#endif
3861 +#endif
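     How the failure_test knob drives MIGHT_FAIL, as a sketch (the direct
     assignment below stands in for a write to
     /sys/power/tuxonice/alloc/failure_test, and assumes the module's
     enabled flag has been set via the adjacent sysfs entry):

	toi_fail_num = 13;	/* arm path 13, "i/o buffer" in toi_alloc_desc */
	buf = toi_kzalloc(13, PAGE_SIZE, GFP_KERNEL);
	/* This first attempt returns NULL; toi_test_count[13] is bumped and
	 * toi_fail_num is cleared, so an identical retry would succeed. */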
3862 diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
3863 new file mode 100644
3864 index 0000000..146c2bd
3865 --- /dev/null
3866 +++ b/kernel/power/tuxonice_alloc.h
3867 @@ -0,0 +1,51 @@
3868 +/*
3869 + * kernel/power/tuxonice_alloc.h
3870 + *
3871 + * Copyright (C) 2007 Nigel Cunningham (nigel at tuxonice net)
3872 + *
3873 + * This file is released under the GPLv2.
3874 + *
3875 + */
3876 +
3877 +#define TOI_WAIT_GFP (GFP_KERNEL | __GFP_NOWARN)
3878 +#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
3879 +
3880 +#ifdef CONFIG_PM_DEBUG
3881 +extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
3882 +extern void toi_kfree(int fail_num, const void *arg);
3883 +
3884 +extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
3885 +               unsigned int order);
3886 +#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
3887 +extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
3888 +extern void toi_free_page(int fail_num, unsigned long buf);
3889 +extern void toi__free_page(int fail_num, struct page *page);
3890 +extern void toi_free_pages(int fail_num, struct page *page, int order);
3891 +extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
3892 +extern int toi_alloc_init(void);
3893 +extern void toi_alloc_exit(void);
3894 +
3895 +extern void toi_alloc_print_debug_stats(void);
3896 +
3897 +#else /* CONFIG_PM_DEBUG */
3898 +
3899 +#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
3900 +#define toi_kfree(FAIL, ALLOCN) (kfree(ALLOCN))
3901 +
3902 +#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
3903 +#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
3904 +#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
3905 +#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
3906 +#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
3907 +#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
3908 +#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
3909 +static inline int toi_alloc_init(void)
3910 +{
3911 +       return 0;
3912 +}
3913 +
3914 +static inline void toi_alloc_exit(void) { }
3915 +
3916 +static inline void toi_alloc_print_debug_stats(void) { }
3917 +
3918 +#endif
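     Call sites look identical under both configurations; with
     CONFIG_PM_DEBUG unset the wrappers compile down to the plain allocator
     calls. An illustrative pairing (path 29 is "get nonconflicting page" in
     the table in tuxonice_alloc.c):

	unsigned long virt = toi_get_zeroed_page(29, TOI_WAIT_GFP);
	if (virt)
		toi_free_page(29, virt);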
3919 diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
3920 new file mode 100644
3921 index 0000000..e2f0c2a
3922 --- /dev/null
3923 +++ b/kernel/power/tuxonice_atomic_copy.c
3924 @@ -0,0 +1,384 @@
3925 +/*
3926 + * kernel/power/tuxonice_atomic_copy.c
3927 + *
3928 + * Copyright 2004-2007 Nigel Cunningham (nigel at tuxonice net)
3929 + * Copyright (C) 2006 Red Hat, inc.
3930 + *
3931 + * Distributed under GPLv2.
3932 + *
3933 + * Routines for doing the atomic save/restore.
3934 + */
3935 +
3936 +#include <linux/suspend.h>
3937 +#include <linux/highmem.h>
3938 +#include <linux/cpu.h>
3939 +#include <linux/freezer.h>
3940 +#include <linux/console.h>
3941 +#include "tuxonice.h"
3942 +#include "tuxonice_storage.h"
3943 +#include "tuxonice_power_off.h"
3944 +#include "tuxonice_ui.h"
3945 +#include "power.h"
3946 +#include "tuxonice_io.h"
3947 +#include "tuxonice_prepare_image.h"
3948 +#include "tuxonice_pageflags.h"
3949 +#include "tuxonice_checksum.h"
3950 +#include "tuxonice_builtin.h"
3951 +#include "tuxonice_atomic_copy.h"
3952 +#include "tuxonice_alloc.h"
3953 +
3954 +long extra_pd1_pages_used;
3955 +
3956 +/**
3957 + * free_pbe_list: Free page backup entries used by the atomic copy code.
3958 + *
3959 + * Normally, this function isn't used. If, however, we need to abort before
3960 + * doing the atomic copy, we use this to free the pbes previously allocated.
3961 + **/
3962 +static void free_pbe_list(struct pbe **list, int highmem)
3963 +{
3964 +       while (*list) {
3965 +               int i;
3966 +               struct pbe *free_pbe, *next_page = NULL;
3967 +               struct page *page;
3968 +
3969 +               if (highmem) {
3970 +                       page = (struct page *) *list;
3971 +                       free_pbe = (struct pbe *) kmap(page);
3972 +               } else {
3973 +                       page = virt_to_page(*list);
3974 +                       free_pbe = *list;
3975 +               }
3976 +
3977 +               for (i = 0; i < PBES_PER_PAGE; i++) {
3978 +                       if (!free_pbe)
3979 +                               break;
3980 +                       if (highmem)
3981 +                               toi__free_page(29, free_pbe->address);
3982 +                       else
3983 +                               toi_free_page(29,
3984 +                                       (unsigned long) free_pbe->address);
3985 +                       free_pbe = free_pbe->next;
3986 +               }
3987 +
3988 +               if (highmem) {
3989 +                       if (free_pbe)
3990 +                               next_page = free_pbe;
3991 +                       kunmap(page);
3992 +               } else {
3993 +                       if (free_pbe)
3994 +                               next_page = free_pbe;
3995 +               }
3996 +
3997 +               toi__free_page(29, page);
3998 +               *list = (struct pbe *) next_page;
3999 +       }
4000 +}
4001 +
4002 +/**
4003 + * copyback_post: Post atomic-restore actions.
4004 + *
4005 + * After doing the atomic restore, we have a few more things to do:
4006 + * 1) We want to retain some values across the restore, so we now copy
4007 + * these from the nosave variables to the normal ones.
4008 + * 2) Set the status flags.
4009 + * 3) Resume devices.
4010 + * 4) Tell userui so it can redraw & restore settings.
4011 + * 5) Reread the page cache.
4012 + **/
4013 +
4014 +void copyback_post(void)
4015 +{
4016 +       struct toi_boot_kernel_data *bkd =
4017 +               (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
4018 +
4019 +       /*
4020 +        * The boot kernel's data may be larger (newer version) or
4021 +        * smaller (older version) than ours. Copy the minimum
4022 +        * of the two sizes, so that we don't overwrite valid values
4023 +        * from pre-atomic copy.
4024 +        */
4025 +
4026 +       memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
4027 +                       min_t(int, sizeof(struct toi_boot_kernel_data),
4028 +                               bkd->size));
4029 +
4030 +       if (toi_activate_storage(1))
4031 +               panic("Failed to reactivate our storage.");
4032 +
4033 +       toi_ui_post_atomic_restore();
4034 +
4035 +       toi_cond_pause(1, "About to reload secondary pagedir.");
4036 +
4037 +       if (read_pageset2(0))
4038 +               panic("Unable to successfully reread the page cache.");
4039 +
4040 +       /*
4041 +        * If the user wants to sleep again after resuming from full-off,
4042 +        * it's most likely to be in order to suspend to ram, so we'll
4043 +        * do this check after loading pageset2, to give them the fastest
4044 +        * wakeup when they are ready to use the computer again.
4045 +        */
4046 +       toi_check_resleep();
4047 +}
4048 +
4049 +/**
4050 + * toi_copy_pageset1: Do the atomic copy of pageset1.
4051 + *
4052 + * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
4053 + * because we can't be sure what side effects it has. On my old Duron, with
4054 + * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
4055 + * count at resume time 4 instead of 3.
4056 + *
4057 + * We don't want to call kmap_atomic unconditionally because it has the side
4058 + * effect of incrementing the preempt count, which will leave it one too high
4059 + * post resume (the page containing the preempt count will be copied after
4060 + * it's incremented). This is essentially the same problem.
4061 + **/
4062 +
4063 +void toi_copy_pageset1(void)
4064 +{
4065 +       int i;
4066 +       unsigned long source_index, dest_index;
4067 +
4068 +       source_index = get_next_bit_on(&pageset1_map, max_pfn + 1);
4069 +       dest_index = get_next_bit_on(&pageset1_copy_map, max_pfn + 1);
4070 +
4071 +       for (i = 0; i < pagedir1.size; i++) {
4072 +               unsigned long *origvirt, *copyvirt;
4073 +               struct page *origpage, *copypage;
4074 +               int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
4075 +                   was_present;
4076 +
4077 +               origpage = pfn_to_page(source_index);
4078 +               copypage = pfn_to_page(dest_index);
4079 +
4080 +               origvirt = PageHighMem(origpage) ?
4081 +                       kmap_atomic(origpage, KM_USER0) :
4082 +                       page_address(origpage);
4083 +
4084 +               copyvirt = PageHighMem(copypage) ?
4085 +                       kmap_atomic(copypage, KM_USER1) :
4086 +                       page_address(copypage);
4087 +
4088 +               was_present = kernel_page_present(origpage);
4089 +               if (!was_present)
4090 +                       kernel_map_pages(origpage, 1, 1);
4091 +
4092 +               while (loop >= 0) {
4093 +                       *(copyvirt + loop) = *(origvirt + loop);
4094 +                       loop--;
4095 +               }
4096 +
4097 +               if (!was_present)
4098 +                       kernel_map_pages(origpage, 1, 0);
4099 +
4100 +               if (PageHighMem(origpage))
4101 +                       kunmap_atomic(origvirt, KM_USER0);
4102 +
4103 +               if (PageHighMem(copypage))
4104 +                       kunmap_atomic(copyvirt, KM_USER1);
4105 +
4106 +               source_index = get_next_bit_on(&pageset1_map, source_index);
4107 +               dest_index = get_next_bit_on(&pageset1_copy_map, dest_index);
4108 +       }
4109 +}
4110 +
4111 +/**
4112 + * __toi_post_context_save: Steps after saving the cpu context.
4113 + *
4114 + * Steps taken after saving the CPU state to make the actual
4115 + * atomic copy.
4116 + *
4117 + * Called from swsusp_save in snapshot.c via toi_post_context_save.
4118 + **/
4119 +
4120 +int __toi_post_context_save(void)
4121 +{
4122 +       long old_ps1_size = pagedir1.size;
4123 +
4124 +       check_checksums();
4125 +
4126 +       free_checksum_pages();
4127 +
4128 +       toi_recalculate_image_contents(1);
4129 +
4130 +       extra_pd1_pages_used = pagedir1.size - old_ps1_size;
4131 +
4132 +       if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
4133 +               printk(KERN_INFO "Pageset1 has grown by %ld pages. "
4134 +                       "extra_pages_allowance is currently only %lu.\n",
4135 +                       pagedir1.size - old_ps1_size,
4136 +                       extra_pd1_pages_allowance);
4137 +               set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
4138 +               return -1;
4139 +       }
4140 +
4141 +       if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
4142 +           !test_action_state(TOI_TEST_BIO))
4143 +               toi_copy_pageset1();
4144 +
4145 +       return 0;
4146 +}
4147 +
4148 +/**
4149 + * toi_hibernate: High level code for doing the atomic copy.
4150 + *
4151 + * High-level code which prepares to do the atomic copy. Loosely based
4152 + * on the swsusp version, but with the following twists:
4153 + * - We set toi_running so the swsusp code uses our code paths.
4154 + * - We give better feedback regarding what goes wrong if there is a problem.
4155 + * - We use an extra function to call the assembly, just in case this code
4156 + *   is in a module (return address).
4157 + **/
4158 +
4159 +int toi_hibernate(void)
4160 +{
4161 +       int error;
4162 +
4163 +       toi_running = 1; /* For the swsusp code we use :< */
4164 +
4165 +       error = toi_lowlevel_builtin();
4166 +
4167 +       toi_running = 0;
4168 +       return error;
4169 +}
4170 +
4171 +/**
4172 + * toi_atomic_restore: Prepare to do the atomic restore.
4173 + *
4174 + * Get ready to do the atomic restore. This gets us into the same
4175 + * state we were in prior to calling do_toi_lowlevel while
4176 + * hibernating: secondary cpus hot-unplugged and processes frozen,
4177 + * before starting the thread that will do the restore.
4178 + **/
4179 +
4180 +int toi_atomic_restore(void)
4181 +{
4182 +       int error;
4183 +
4184 +       toi_running = 1;
4185 +
4186 +       toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
4187 +
4188 +       if (add_boot_kernel_data_pbe())
4189 +               goto Failed;
4190 +
4191 +       if (toi_go_atomic(PMSG_QUIESCE, 0))
4192 +               goto Failed;
4193 +
4194 +       /* We'll ignore saved state, but this gets preempt count (etc) right */
4195 +       save_processor_state();
4196 +
4197 +       error = swsusp_arch_resume();
4198 +       /*
4199 +        * Code below is only ever reached in case of failure. Otherwise
4200 +        * execution continues at the place where swsusp_arch_suspend was called.
4201 +        *
4202 +        * We don't know whether it's safe to continue (this shouldn't happen),
4203 +        * so let's err on the side of caution.
4204 +        */
4205 +       BUG();
4206 +
4207 +Failed:
4208 +       free_pbe_list(&restore_pblist, 0);
4209 +#ifdef CONFIG_HIGHMEM
4210 +       free_pbe_list(&restore_highmem_pblist, 1);
4211 +#endif
4212 +       if (test_action_state(TOI_PM_PREPARE_CONSOLE))
4213 +               pm_restore_console();
4214 +       toi_running = 0;
4215 +       return 1;
4216 +}
4217 +
4218 +int toi_go_atomic(pm_message_t state, int suspend_time)
4219 +{
4220 +       toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
4221 +
4222 +       if (suspend_time && toi_platform_begin()) {
4223 +               set_abort_result(TOI_PLATFORM_PREP_FAILED);
4224 +               toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 0);
4225 +               return 1;
4226 +       }
4227 +
4228 +       suspend_console();
4229 +
4230 +       if (device_suspend(state)) {
4231 +               set_abort_result(TOI_DEVICE_REFUSED);
4232 +               toi_end_atomic(ATOMIC_STEP_RESUME_CONSOLE, suspend_time, 1);
4233 +               return 1;
4234 +       }
4235 +
4236 +       if (suspend_time && toi_platform_pre_snapshot()) {
4237 +               set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
4238 +               toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 0);
4239 +               return 1;
4240 +       }
4241 +
4242 +       if (!suspend_time && toi_platform_pre_restore()) {
4243 +               set_abort_result(TOI_PRE_RESTORE_FAILED);
4244 +               toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 0);
4245 +               return 1;
4246 +       }
4247 +
4248 +       if (test_action_state(TOI_LATE_CPU_HOTPLUG)) {
4249 +               if (disable_nonboot_cpus()) {
4250 +                       set_abort_result(TOI_CPU_HOTPLUG_FAILED);
4251 +                       toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG,
4252 +                                       suspend_time, 0);
4253 +                       return 1;
4254 +               }
4255 +       }
4256 +
4257 +       if (suspend_time && arch_prepare_suspend()) {
4258 +               set_abort_result(TOI_ARCH_PREPARE_FAILED);
4259 +               toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, suspend_time, 0);
4260 +               return 1;
4261 +       }
4262 +
4263 +       device_pm_lock();
4264 +       local_irq_disable();
4265 +
4266 +       /* At this point, device_suspend() has been called, but *not*
4267 +        * device_power_down(). We *must* call device_power_down() now.
4268 +        * Otherwise, drivers for some devices (e.g. interrupt controllers)
4269 +        * become desynchronized with the actual state of the hardware
4270 +        * at resume time, and evil weirdness ensues.
4271 +        */
4272 +
4273 +       if (device_power_down(state)) {
4274 +               set_abort_result(TOI_DEVICE_REFUSED);
4275 +               toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 0);
4276 +               return 1;
4277 +       }
4278 +
4279 +       return 0;
4280 +}
4281 +
4282 +void toi_end_atomic(int stage, int suspend_time, int error)
4283 +{
4284 +       switch (stage) {
4285 +       case ATOMIC_ALL_STEPS:
4286 +               if (!suspend_time)
4287 +                       toi_platform_leave();
4288 +               device_power_up(error ? PMSG_RECOVER :
4289 +                       (suspend_time ? PMSG_THAW : PMSG_RESTORE));
4290 +       case ATOMIC_STEP_IRQS:
4291 +               local_irq_enable();
4292 +               device_pm_unlock();
4293 +       case ATOMIC_STEP_CPU_HOTPLUG:
4294 +               if (test_action_state(TOI_LATE_CPU_HOTPLUG))
4295 +                       enable_nonboot_cpus();
4296 +       case ATOMIC_STEP_PLATFORM_FINISH:
4297 +               toi_platform_finish();
4298 +       case ATOMIC_STEP_DEVICE_RESUME:
4299 +               device_resume(error ? PMSG_RECOVER :
4300 +                       (suspend_time ? PMSG_THAW : PMSG_RESTORE));
4301 +       case ATOMIC_STEP_RESUME_CONSOLE:
4302 +               resume_console();
4303 +       case ATOMIC_STEP_PLATFORM_END:
4304 +               toi_platform_end();
4305 +
4306 +               toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
4307 +       }
4308 +}
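
Note that the cases above deliberately fall through: toi_go_atomic() passes
the stage corresponding to how far it got, and every later teardown step then
runs in order. For example, when device_power_down() fails, toi_go_atomic()
calls toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 0), which executes:

    local_irq_enable();
    device_pm_unlock();
    enable_nonboot_cpus();      /* only with TOI_LATE_CPU_HOTPLUG set */
    toi_platform_finish();
    device_resume(...);         /* PMSG_THAW if suspending, else PMSG_RESTORE */
    resume_console();
    toi_platform_end();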
4309 diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
4310 new file mode 100644
4311 index 0000000..7ec08a8
4312 --- /dev/null
4313 +++ b/kernel/power/tuxonice_atomic_copy.h
4314 @@ -0,0 +1,22 @@
4315 +/*
4316 + * kernel/power/tuxonice_atomic_copy.h
4317 + *
4318 + * Copyright 2007 Nigel Cunningham (nigel at tuxonice net)
4319 + *
4320 + * Distributed under GPLv2.
4321 + *
4322 + * Routines for doing the atomic save/restore.
4323 + */
4324 +
4325 +enum {
4326 +       ATOMIC_ALL_STEPS,
4327 +       ATOMIC_STEP_IRQS,
4328 +       ATOMIC_STEP_CPU_HOTPLUG,
4329 +       ATOMIC_STEP_PLATFORM_FINISH,
4330 +       ATOMIC_STEP_DEVICE_RESUME,
4331 +       ATOMIC_STEP_RESUME_CONSOLE,
4332 +       ATOMIC_STEP_PLATFORM_END,
4333 +};
4334 +
4335 +int toi_go_atomic(pm_message_t state, int toi_time);
4336 +void toi_end_atomic(int stage, int toi_time, int error);
4337 diff --git a/kernel/power/tuxonice_block_io.c b/kernel/power/tuxonice_block_io.c
4338 new file mode 100644
4339 index 0000000..b018914
4340 --- /dev/null
4341 +++ b/kernel/power/tuxonice_block_io.c
4342 @@ -0,0 +1,1193 @@
4343 +/*
4344 + * kernel/power/tuxonice_block_io.c
4345 + *
4346 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
4347 + *
4348 + * Distributed under GPLv2.
4349 + *
4350 + * This file contains block io functions for TuxOnIce. These are
4351 + * used by the swapwriter and it is planned that they will also
4352 + * be used by the NFSwriter.
4353 + *
4354 + */
4355 +
4356 +#include <linux/blkdev.h>
4357 +#include <linux/syscalls.h>
4358 +#include <linux/suspend.h>
4359 +
4360 +#include "tuxonice.h"
4361 +#include "tuxonice_sysfs.h"
4362 +#include "tuxonice_modules.h"
4363 +#include "tuxonice_prepare_image.h"
4364 +#include "tuxonice_block_io.h"
4365 +#include "tuxonice_ui.h"
4366 +#include "tuxonice_alloc.h"
4367 +#include "tuxonice_io.h"
4368 +
4369 +
4370 +#if 0
4371 +static int pr_index;
4372 +
4373 +static inline void reset_pr_index(void)
4374 +{
4375 +       pr_index = 0;
4376 +}
4377 +
4378 +#define PR_DEBUG(a, b...) do { \
4379 +       if (pr_index < 20) \
4380 +               printk(a, ##b); \
4381 +} while (0)
4382 +
4383 +/* Called as a bare statement ('inc_pr_index;'), so this must be a macro
4384 + * rather than a function for the call sites to work when enabled. */
4385 +#define inc_pr_index \
4386 +       do { pr_index++; } while (0)
4387 +#else
4388 +#define PR_DEBUG(a, b...) do { } while (0)
4389 +#define reset_pr_index() do { } while (0)
4390 +#define inc_pr_index do { } while (0)
4391 +#endif
4392 +
4393 +#define TARGET_OUTSTANDING_IO 16384
4394 +
4395 +#define MEASURE_MUTEX_CONTENTION
4396 +#ifndef MEASURE_MUTEX_CONTENTION
4397 +#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
4398 +#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
4399 +#else
4400 +unsigned long mutex_times[2][2][NR_CPUS];
4401 +#define my_mutex_lock(index, the_lock) do { \
4402 +       int have_mutex; \
4403 +       have_mutex = mutex_trylock(the_lock); \
4404 +       if (!have_mutex) { \
4405 +               mutex_lock(the_lock); \
4406 +               mutex_times[index][0][smp_processor_id()]++; \
4407 +       } else { \
4408 +               mutex_times[index][1][smp_processor_id()]++; \
4409 +       }
4410 +
4411 +#define my_mutex_unlock(index, the_lock) \
4412 +       mutex_unlock(the_lock); \
4413 +} while (0)
4414 +#endif
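
A caveat with the contention-measuring variants above: my_mutex_lock() opens
a do { block that only the matching my_mutex_unlock() closes, so the two must
always appear as a lexically matched pair in the same scope, as they do
throughout this file:

    my_mutex_lock(0, &toi_bio_mutex);
    /* ... work guarded by the mutex ... */
    my_mutex_unlock(0, &toi_bio_mutex);

The pair expands to a single do { ... } while (0) statement that takes the
mutex, counts whether it was contended, and releases it again.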
4415 +
4416 +static int target_outstanding_io = 1024;
4417 +static int max_outstanding_writes, max_outstanding_reads;
4418 +
4419 +static struct page *bio_queue_head, *bio_queue_tail;
4420 +static DEFINE_SPINLOCK(bio_queue_lock);
4421 +
4422 +static int free_mem_throttle;
4423 +static int more_readahead = 1;
4424 +static struct page *readahead_list_head, *readahead_list_tail;
4425 +
4426 +static struct page *waiting_on;
4427 +
4428 +static atomic_t toi_io_in_progress;
4429 +static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
4430 +
4431 +static int extra_page_forward;
4432 +
4433 +static int current_stream;
4434 +/* 0 = Header, 1 = Pageset1, 2 = Pageset2, 3 = End of PS1 */
4435 +struct hibernate_extent_iterate_saved_state toi_writer_posn_save[4];
4436 +
4437 +/* Pointer to current entry being loaded/saved. */
4438 +struct hibernate_extent_iterate_state toi_writer_posn;
4439 +
4440 +/* Not static, so that the allocators can set up and complete
4441 + * writing the header */
4442 +char *toi_writer_buffer;
4443 +int toi_writer_buffer_posn;
4444 +
4445 +static struct toi_bdev_info *toi_devinfo;
4446 +
4447 +DEFINE_MUTEX(toi_bio_mutex);
4448 +
4449 +static struct task_struct *toi_queue_flusher;
4450 +static int toi_bio_queue_flush_pages(int dedicated_thread);
4451 +
4452 +/**
4453 + * set_throttle: Set the point where we pause to avoid oom.
4454 + *
4455 + * Initially, this value is zero, but when we first fail to allocate memory,
4456 + * we set it (plus a buffer) and thereafter throttle i/o once that limit is
4457 + * reached.
4458 + */
4459 +
4460 +static void set_throttle(void)
4461 +{
4462 +       int new_throttle = nr_unallocated_buffer_pages() + 256;
4463 +
4464 +       if (new_throttle > free_mem_throttle)
4465 +               free_mem_throttle = new_throttle;
4466 +}
4467 +
4468 +#define NUM_REASONS 10
4469 +static atomic_t reasons[NUM_REASONS];
4470 +static char *reason_name[NUM_REASONS] = {
4471 +       "readahead not ready",
4472 +       "bio allocation",
4473 +       "io_struct allocation",
4474 +       "submit buffer",
4475 +       "synchronous I/O",
4476 +       "bio mutex when reading",
4477 +       "bio mutex when writing",
4478 +       "toi_bio_get_new_page",
4479 +       "memory low",
4480 +       "readahead buffer allocation"
4481 +};
4482 +
4483 +/**
4484 + * do_bio_wait: Wait for some TuxOnIce i/o to complete.
4485 + *
4486 + * Submit any I/O that's batched up (if we're not already doing
4487 + * that), schedule and clean up whatever we can.
4488 + */
4489 +static void do_bio_wait(int reason)
4490 +{
4491 +       struct page *was_waiting_on = waiting_on;
4492 +
4493 +       /* On SMP, waiting_on can be reset, so we make a copy */
4494 +       if (was_waiting_on) {
4495 +               if (PageLocked(was_waiting_on)) {
4496 +                       wait_on_page_bit(was_waiting_on, PG_locked);
4497 +                       atomic_inc(&reasons[reason]);
4498 +               }
4499 +       } else {
4500 +               atomic_inc(&reasons[reason]);
4501 +
4502 +               wait_event(num_in_progress_wait,
4503 +                       !atomic_read(&toi_io_in_progress) ||
4504 +                       nr_unallocated_buffer_pages() > free_mem_throttle);
4505 +       }
4506 +}
4507 +
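set_throttle() and do_bio_wait() combine into the allocate/back-off pattern
used throughout this file (see submit(), toi_start_one_readahead() and
toi_bio_get_new_page()). In sketch form:

    char *buffer = NULL;

    while (!buffer) {
            buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
            if (!buffer) {
                    set_throttle();  /* remember the level at which we failed */
                    do_bio_wait(7);  /* let some i/o complete, then retry */
            }
    }
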
4508 +static void throttle_if_memory_low(void)
4509 +{
4510 +       int free_pages = nr_unallocated_buffer_pages();
4511 +
4512 +       /* Getting low on memory and I/O is in progress? */
4513 +       while (unlikely(free_pages < free_mem_throttle) &&
4514 +                       atomic_read(&toi_io_in_progress)) {
4515 +               do_bio_wait(8);
4516 +               free_pages = nr_unallocated_buffer_pages();
4517 +       }
4518 +}
4519 +
4520 +/**
4521 + * toi_finish_all_io: Complete all outstanding i/o.
4522 + */
4523 +static void toi_finish_all_io(void)
4524 +{
4525 +       wait_event(num_in_progress_wait, !atomic_read(&toi_io_in_progress));
4526 +}
4527 +
4528 +/**
4529 + * toi_end_bio: bio completion function.
4530 + *
4531 + * @bio: bio that has completed.
4532 + * @err: Error value. Yes, like end_swap_bio_read, we ignore it.
4533 + *
4534 + * Function called by block driver from interrupt context when I/O is completed.
4535 + * Nearly the fs/buffer.c version, but we want to do our cleanup too. We only
4536 + * free pages if they were buffers used when writing the image.
4537 + */
4538 +static void toi_end_bio(struct bio *bio, int err)
4539 +{
4540 +       struct page *page = bio->bi_io_vec[0].bv_page;
4541 +
4542 +       BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
4543 +
4544 +       unlock_page(page);
4545 +       bio_put(bio);
4546 +
4547 +       if (waiting_on == page)
4548 +               waiting_on = NULL;
4549 +
4550 +       put_page(page);
4551 +
4552 +       if (bio->bi_private)
4553 +               toi__free_page((int) ((unsigned long) bio->bi_private), page);
4554 +
4555 +       bio_put(bio);
4556 +
4557 +       atomic_dec(&toi_io_in_progress);
4558 +
4559 +       wake_up(&num_in_progress_wait);
4560 +}
4561 +
4562 +/**
4563 + *     submit - submit BIO request.
4564 + *     @writing: READ or WRITE.
4565 + *
4566 + *     Based on Patrick's pmdisk code from long ago:
4567 + *     "Straight from the textbook - allocate and initialize the bio.
4568 + *     If we're writing, make sure the page is marked as dirty.
4569 + *     Then submit it and carry on."
4570 + *
4571 + *     With a twist, though - we handle block_size != PAGE_SIZE.
4572 + *     Caller has already checked that our page is not fragmented.
4573 + */
4574 +static int submit(int writing, struct block_device *dev, sector_t first_block,
4575 +               struct page *page, int free_group)
4576 +{
4577 +       struct bio *bio = NULL;
4578 +       int cur_outstanding_io;
4579 +
4580 +       throttle_if_memory_low();
4581 +
4582 +       while (!bio) {
4583 +               bio = bio_alloc(TOI_ATOMIC_GFP, 1);
4584 +               if (!bio) {
4585 +                       set_throttle();
4586 +                       do_bio_wait(1);
4587 +               }
4588 +       }
4589 +
4590 +       bio->bi_bdev = dev;
4591 +       bio->bi_sector = first_block;
4592 +       bio->bi_private = (void *) ((unsigned long) free_group);
4593 +       bio->bi_end_io = toi_end_bio;
4594 +
4595 +       if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
4596 +               printk(KERN_INFO "ERROR: adding page to bio at %lld\n",
4597 +                               (unsigned long long) first_block);
4598 +               bio_put(bio);
4599 +               return -EFAULT;
4600 +       }
4601 +
4602 +       bio_get(bio);
4603 +
4604 +       cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
4605 +       if (writing) {
4606 +               if (cur_outstanding_io > max_outstanding_writes)
4607 +                       max_outstanding_writes = cur_outstanding_io;
4608 +       } else {
4609 +               if (cur_outstanding_io > max_outstanding_reads)
4610 +                       max_outstanding_reads = cur_outstanding_io;
4611 +       }
4612 +
4614 +       if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) {
4615 +               /* Fake having done the hard work */
4616 +               set_bit(BIO_UPTODATE, &bio->bi_flags);
4617 +               toi_end_bio(bio, 0);
4618 +       } else
4619 +               submit_bio(writing | (1 << BIO_RW_SYNC), bio);
4620 +
4621 +       return 0;
4622 +}
4623 +
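The bio_get() above is why toi_end_bio() contains two bio_put() calls:
bio_alloc() hands back a bio holding one reference and bio_get() takes a
second, so the completion handler can still read bio->bi_private after its
first put, and only the second put frees the bio:

    bio = bio_alloc(TOI_ATOMIC_GFP, 1);   /* reference count 1 */
    bio_get(bio);                         /* reference count 2 */
    /* ... i/o completes; then, in toi_end_bio(): ... */
    bio_put(bio);                         /* count 1: bi_private still valid */
    bio_put(bio);                         /* count 0: bio freed */
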
4624 +/**
4625 + * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
4626 + *
4627 + * @writing: Whether reading or writing.
4628 + * @bdev: The block device which we're using.
4629 + * @block0: The first sector we're reading or writing.
4630 + * @page: The page on which I/O is being done.
4631 + * @is_readahead: Whether the page is part of a readahead batch.
4632 + * @syncio: Whether the i/o is being done synchronously.
4633 + *
4634 + * Prepare and start a read or write operation.
4635 + *
4636 + * Note that we always work with our own page. If writing, we might be given a
4637 + * compression buffer that will immediately be used to start compressing the
4638 + * next page. For reading, we do readahead and therefore don't know the final
4639 + * address where the data needs to go.
4640 + */
4641 +static int toi_do_io(int writing, struct block_device *bdev, long block0,
4642 +       struct page *page, int is_readahead, int syncio, int free_group)
4643 +{
4644 +       page->private = 0;
4645 +
4646 +       /* Do this here so we don't race against toi_bio_get_next_page_read */
4647 +       lock_page(page);
4648 +
4649 +       if (is_readahead) {
4650 +               if (readahead_list_head)
4651 +                       readahead_list_tail->private = (unsigned long) page;
4652 +               else
4653 +                       readahead_list_head = page;
4654 +
4655 +               readahead_list_tail = page;
4656 +       }
4657 +
4658 +       /* Done before submitting to avoid races. */
4659 +       if (syncio)
4660 +               waiting_on = page;
4661 +
4662 +       /* Submit the page */
4663 +       get_page(page);
4664 +
4665 +       if (submit(writing, bdev, block0, page, free_group))
4666 +               return -EFAULT;
4667 +
4668 +       if (syncio)
4669 +               do_bio_wait(4);
4670 +
4671 +       return 0;
4672 +}
4673 +
4674 +/**
4675 + * toi_bdev_page_io: Simpler interface to do i/o directly on a single page.
4676 + *
4677 + * @writing: Whether reading or writing.
4678 + * @bdev: Block device on which we're operating.
4679 + * @pos: Sector at which page to read starts.
4680 + * @page: Page to be read/written.
4681 + *
4682 + * We used to use bread here, but it doesn't correctly handle
4683 + * blocksize != PAGE_SIZE. Now we set up the i/o ourselves and use our
4684 + * normal routines (synchronously).
4685 + */
4686 +static int toi_bdev_page_io(int writing, struct block_device *bdev,
4687 +               long pos, struct page *page)
4688 +{
4689 +       return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
4690 +}
4691 +
4692 +/**
4693 + * toi_bio_memory_needed: Report amount of memory needed for block i/o.
4694 + *
4695 + * We want at least enough memory to keep target_outstanding_io
4696 + * transactions in flight at once. If we can do more, fine.
4697 + */
4698 +static int toi_bio_memory_needed(void)
4699 +{
4700 +       return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
4701 +                               sizeof(struct bio));
4702 +}
4703 +
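To put a rough number on this: with the default target_outstanding_io of
1024, 4096-byte pages, and (illustratively) a few hundred bytes for each
struct request plus struct bio, the reservation works out to about
1024 x ~4.5KB, i.e. around 4.5MB of memory for in-flight i/o.
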
4704 +/*
4705 + * toi_bio_print_debug_stats
4706 + *
4707 + * Description: Print max outstanding i/o, memory needed and mutex stats.
4708 + */
4709 +static int toi_bio_print_debug_stats(char *buffer, int size)
4710 +{
4711 +       int len = snprintf_used(buffer, size, "- Max outstanding reads %d. Max "
4712 +                       "writes %d.\n", max_outstanding_reads,
4713 +                       max_outstanding_writes);
4714 +
4715 +       len += snprintf_used(buffer + len, size - len,
4716 +               "  Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
4717 +               target_outstanding_io,
4718 +               PAGE_SIZE, (unsigned int) sizeof(struct request),
4719 +               (unsigned int) sizeof(struct bio), toi_bio_memory_needed());
4720 +
4721 +#ifdef MEASURE_MUTEX_CONTENTION
4722 +       {
4723 +       int i;
4724 +
4725 +       len += snprintf_used(buffer + len, size - len,
4726 +               "  Mutex contention while reading:\n  Contended      Free\n");
4727 +
4728 +       for_each_online_cpu(i)
4729 +               len += snprintf_used(buffer + len, size - len,
4730 +               "  %9lu %9lu\n",
4731 +               mutex_times[0][0][i], mutex_times[0][1][i]);
4732 +
4733 +       len += snprintf_used(buffer + len, size - len,
4734 +               "  Mutex contention while writing:\n  Contended      Free\n");
4735 +
4736 +       for_each_online_cpu(i)
4737 +               len += snprintf_used(buffer + len, size - len,
4738 +               "  %9lu %9lu\n",
4739 +               mutex_times[1][0][i], mutex_times[1][1][i]);
4740 +
4741 +       }
4742 +#endif
4743 +
4744 +       return len + snprintf_used(buffer + len, size - len,
4745 +               "  Free mem throttle point reached %d.\n", free_mem_throttle);
4746 +}
4747 +
4748 +/**
4749 + * toi_set_devinfo: Set the bdev info used for i/o.
4750 + *
4751 + * @info: Pointer to array of struct toi_bdev_info - the list of
4752 + * bdevs and blocks on them in which the image is stored.
4753 + *
4754 + * Set the list of bdevs and blocks in which the image will be stored.
4755 + * Sort of like putting a tape in the cassette player.
4756 + */
4757 +static void toi_set_devinfo(struct toi_bdev_info *info)
4758 +{
4759 +       toi_devinfo = info;
4760 +}
4761 +
4762 +/**
4763 + * dump_block_chains: Print the contents of the bdev info array.
4764 + */
4765 +static void dump_block_chains(void)
4766 +{
4767 +       int i;
4768 +
4769 +       for (i = 0; i < toi_writer_posn.num_chains; i++) {
4770 +               struct hibernate_extent *this;
4771 +
4772 +               this = (toi_writer_posn.chains + i)->first;
4773 +
4774 +               if (!this)
4775 +                       continue;
4776 +
4777 +               printk(KERN_INFO "Chain %d:", i);
4778 +
4779 +               while (this) {
4780 +                       printk(" [%lu-%lu]%s", this->start,
4781 +                                       this->end, this->next ? "," : "");
4782 +                       this = this->next;
4783 +               }
4784 +
4785 +               printk("\n");
4786 +       }
4787 +
4788 +       for (i = 0; i < 4; i++)
4789 +               printk(KERN_INFO "Posn %d: Chain %d, extent %d, offset %lu.\n",
4790 +                               i, toi_writer_posn_save[i].chain_num,
4791 +                               toi_writer_posn_save[i].extent_num,
4792 +                               toi_writer_posn_save[i].offset);
4793 +}
4794 +
4795 +/**
4796 + * go_next_page: Skip blocks to the start of the next page.
4797 + *
4798 + * Go forward one page, or two if extra_page_forward is set. It only gets
4799 + * set at the start of reading the image header, to skip the first page
4800 + * of the header, which is read without using the extent chains.
4801 + */
4802 +static int go_next_page(int writing)
4803 +{
4804 +       int i, max = (toi_writer_posn.current_chain == -1) ? 1 :
4805 +         toi_devinfo[toi_writer_posn.current_chain].blocks_per_page;
4806 +
4807 +       for (i = 0; i < max; i++)
4808 +               toi_extent_state_next(&toi_writer_posn);
4809 +
4810 +       if (toi_extent_state_eof(&toi_writer_posn)) {
4811 +               /* Don't complain if readahead falls off the end */
4812 +               if (writing) {
4813 +                       printk(KERN_INFO "Extent state eof. "
4814 +                               "Expected compression ratio too optimistic?\n");
4815 +                       dump_block_chains();
4816 +               }
4817 +               return -ENODATA;
4818 +       }
4819 +
4820 +       if (extra_page_forward) {
4821 +               extra_page_forward = 0;
4822 +               return go_next_page(writing);
4823 +       }
4824 +
4825 +       return 0;
4826 +}
4827 +
4828 +/**
4829 + * set_extra_page_forward: Make us skip an extra page on next go_next_page.
4830 + *
4831 + * Used in reading header, to jump to 2nd page after getting 1st page
4832 + * direct from image header.
4833 + */
4834 +static void set_extra_page_forward(void)
4835 +{
4836 +       extra_page_forward = 1;
4837 +}
4838 +
4839 +/**
4840 + * toi_bio_rw_page: Do i/o on the next disk page in the image.
4841 + *
4842 + * @writing: Whether reading or writing.
4843 + * @page: Page to do i/o on.
4844 + * @is_readahead: Whether the page is being read ahead of actual need.
4845 + *
4846 + * Submit a page for reading or writing, possibly readahead.
4847 + */
4848 +static int toi_bio_rw_page(int writing, struct page *page,
4849 +               int is_readahead, int free_group)
4850 +{
4851 +       struct toi_bdev_info *dev_info;
4852 +       int result;
4853 +
4854 +       if (go_next_page(writing)) {
4855 +               printk(KERN_INFO "Failed to advance a page in the extent "
4856 +                               "data.\n");
4857 +               return -ENODATA;
4858 +       }
4859 +
4860 +       if (current_stream == 0 && writing &&
4861 +               toi_writer_posn.current_chain ==
4862 +                       toi_writer_posn_save[2].chain_num &&
4863 +               toi_writer_posn.current_offset ==
4864 +                       toi_writer_posn_save[2].offset) {
4865 +               dump_block_chains();
4866 +               BUG();
4867 +       }
4868 +
4869 +       dev_info = &toi_devinfo[toi_writer_posn.current_chain];
4870 +
4871 +       result = toi_do_io(writing, dev_info->bdev,
4872 +               toi_writer_posn.current_offset <<
4873 +                       dev_info->bmap_shift,
4874 +               page, is_readahead, 0, free_group);
4875 +
4876 +       if (result) {
4877 +               more_readahead = 0;
4878 +               return result;
4879 +       }
4880 +
4881 +       if (!writing) {
4882 +               int compare_to = 0;
4883 +
4884 +               switch (current_stream) {
4885 +               case 0:
4886 +                       compare_to = 2;
4887 +                       break;
4888 +               case 1:
4889 +                       compare_to = 3;
4890 +                       break;
4891 +               case 2:
4892 +                       compare_to = 1;
4893 +                       break;
4894 +               }
4895 +
4896 +               if (toi_writer_posn.current_chain ==
4897 +                               toi_writer_posn_save[compare_to].chain_num &&
4898 +                   toi_writer_posn.current_offset ==
4899 +                               toi_writer_posn_save[compare_to].offset)
4900 +                       more_readahead = 0;
4901 +       }
4902 +       return 0;
4903 +}
4904 +
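The compare_to mapping above cuts readahead off at the saved position that
marks the end of the stream being read: the header (stream 0) runs up to
where pageset2 begins (slot 2), pageset1 up to the 'end of PS1' marker
(slot 3), and pageset2 up to where pageset1 begins (slot 1), matching the
order in which toi_rw_cleanup() below saves positions while writing.
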
4905 +/**
4906 + * toi_rw_init: Prepare to read or write a stream in the image.
4907 + *
4908 + * @writing: Whether reading or writing.
4909 + * @stream_number: Section of the image being processed.
4910 + */
4911 +static int toi_rw_init(int writing, int stream_number)
4912 +{
4913 +       if (stream_number)
4914 +               toi_extent_state_restore(&toi_writer_posn,
4915 +                               &toi_writer_posn_save[stream_number]);
4916 +       else
4917 +               toi_extent_state_goto_start(&toi_writer_posn);
4918 +
4919 +       toi_writer_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
4920 +       toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
4921 +
4922 +       current_stream = stream_number;
4923 +
4924 +       reset_pr_index();
4925 +       more_readahead = 1;
4926 +
4927 +       return toi_writer_buffer ? 0 : -ENOMEM;
4928 +}
4929 +
4930 +/**
4931 + * toi_read_header_init: Prepare to read the image header.
4932 + *
4933 + * Reset readahead indices prior to starting to read a section of the image.
4934 + */
4935 +static void toi_read_header_init(void)
4936 +{
4937 +       toi_writer_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
4938 +       more_readahead = 1;
4939 +}
4940 +
4941 +/*
4942 + * toi_bio_queue_write: Queue a full buffer page for writing; wake the flusher.
4943 + */
4944 +static void toi_bio_queue_write(char **full_buffer)
4945 +{
4946 +       struct page *page = virt_to_page(*full_buffer);
4947 +       unsigned long flags;
4948 +
4949 +       page->private = 0;
4950 +
4951 +       spin_lock_irqsave(&bio_queue_lock, flags);
4952 +       if (!bio_queue_head)
4953 +               bio_queue_head = page;
4954 +       else
4955 +               bio_queue_tail->private = (unsigned long) page;
4956 +
4957 +       bio_queue_tail = page;
4958 +
4959 +       spin_unlock_irqrestore(&bio_queue_lock, flags);
4960 +       wake_up(&toi_io_queue_flusher);
4961 +
4962 +       *full_buffer = NULL;
4963 +}
4964 +
4965 +/**
4966 + * toi_rw_cleanup: Cleanup after i/o.
4967 + *
4968 + * @writing: Whether we were reading or writing.
4969 + */
4970 +static int toi_rw_cleanup(int writing)
4971 +{
4972 +       int i;
4973 +
4974 +       if (writing) {
4975 +               int result;
4976 +
4977 +               if (toi_writer_buffer_posn)
4978 +                       toi_bio_queue_write(&toi_writer_buffer);
4979 +
4980 +               result = toi_bio_queue_flush_pages(0);
4981 +
4982 +               if (result)
4983 +                       return result;
4984 +
4985 +               if (current_stream == 2)
4986 +                       toi_extent_state_save(&toi_writer_posn,
4987 +                                       &toi_writer_posn_save[1]);
4988 +               else if (current_stream == 1)
4989 +                       toi_extent_state_save(&toi_writer_posn,
4990 +                                       &toi_writer_posn_save[3]);
4991 +       }
4992 +
4993 +       toi_finish_all_io();
4994 +
4995 +       while (readahead_list_head) {
4996 +               void *next = (void *) readahead_list_head->private;
4997 +               toi__free_page(12, readahead_list_head);
4998 +               readahead_list_head = next;
4999 +       }
5000 +
5001 +       readahead_list_tail = NULL;
5002 +
5003 +       if (!current_stream)
5004 +               return 0;
5005 +
5006 +       for (i = 0; i < NUM_REASONS; i++) {
5007 +               if (!atomic_read(&reasons[i]))
5008 +                       continue;
5009 +               printk(KERN_INFO "Waited for i/o due to %s %d times.\n",
5010 +                               reason_name[i], atomic_read(&reasons[i]));
5011 +               atomic_set(&reasons[i], 0);
5012 +       }
5013 +
5014 +       current_stream = 0;
5015 +       return 0;
5016 +}
5017 +
5018 +static int toi_start_one_readahead(int dedicated_thread)
5019 +{
5020 +       char *buffer = NULL;
5021 +       int oom = 0;
5022 +
5023 +       throttle_if_memory_low();
5024 +
5025 +       while (!buffer) {
5026 +               buffer = (char *) toi_get_zeroed_page(12,
5027 +                               TOI_ATOMIC_GFP);
5028 +               if (!buffer) {
5029 +                       if (oom && !dedicated_thread)
5030 +                               return -EIO;
5031 +
5032 +                       oom = 1;
5033 +                       set_throttle();
5034 +                       do_bio_wait(9);
5035 +               }
5036 +       }
5037 +
5038 +       return toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
5039 +}
5040 +
5041 +/*
5042 + * toi_start_new_readahead
5043 + *
5044 + * Start readahead of image pages.
5045 + *
5046 + * No mutex needed because this is only ever called by one cpu.
5047 + */
5048 +static int toi_start_new_readahead(int dedicated_thread)
5049 +{
5050 +       int last_result, num_submitted = 0;
5051 +
5052 +       /* Start a new readahead? */
5053 +       if (!more_readahead)
5054 +               return 0;
5055 +
5056 +       do {
5057 +               int result = toi_start_one_readahead(dedicated_thread);
5058 +
5059 +               if (result == -EIO)
5060 +                       return 0;
5061 +               else
5062 +                       last_result = result;
5063 +
5064 +               if (last_result == -ENODATA)
5065 +                       more_readahead = 0;
5066 +
5067 +               if (!more_readahead && last_result) {
5068 +                       /*
5069 +                        * Don't complain about failing to do readahead past
5070 +                        * the end of storage.
5071 +                        */
5072 +                       if (last_result != -ENODATA)
5073 +                               printk(KERN_INFO
5074 +                                       "Begin read chunk returned %d.\n",
5075 +                                       last_result);
5076 +               } else
5077 +                       num_submitted++;
5078 +
5079 +       } while (more_readahead &&
5080 +                (dedicated_thread ||
5081 +                 (num_submitted < target_outstanding_io &&
5082 +                  atomic_read(&toi_io_in_progress) < target_outstanding_io)));
5083 +       return 0;
5084 +}
5085 +
5086 +static void bio_io_flusher(int writing)
5087 +{
5089 +       if (writing)
5090 +               toi_bio_queue_flush_pages(1);
5091 +       else
5092 +               toi_start_new_readahead(1);
5093 +}
5094 +
5095 +/**
5096 + * toi_bio_get_next_page_read: Read a disk page with readahead.
5097 + *
5098 + * Read a page from disk, submitting readahead and cleaning up finished i/o
5099 + * while we wait for the page we're after.
5100 + */
5101 +static int toi_bio_get_next_page_read(int no_readahead)
5102 +{
5103 +       unsigned long *virt;
5104 +       struct page *next;
5105 +
5106 +       /*
5107 +        * When reading the second page of the header, we have to
5108 +        * delay submitting the read until after we've gotten the
5109 +        * extents out of the first page.
5110 +        */
5111 +       if (unlikely(no_readahead && toi_start_one_readahead(0))) {
5112 +               printk(KERN_INFO "No readahead and toi_start_one_readahead "
5113 +                               "returned non-zero.\n");
5114 +               return -EIO;
5115 +       }
5116 +
5117 +       /*
5118 +        * On SMP, we may need to wait for the first readahead
5119 +        * to be submitted.
5120 +        */
5121 +       if (unlikely(!readahead_list_head)) {
5122 +               BUG_ON(!more_readahead);
5123 +               do {
5124 +                       cpu_relax();
5125 +               } while (!readahead_list_head);
5126 +       }
5127 +
5128 +       if (PageLocked(readahead_list_head)) {
5129 +               waiting_on = readahead_list_head;
5130 +               do_bio_wait(0);
5131 +       }
5132 +
5133 +       virt = page_address(readahead_list_head);
5134 +       memcpy(toi_writer_buffer, virt, PAGE_SIZE);
5135 +
5136 +       next = (struct page *) readahead_list_head->private;
5137 +       toi__free_page(12, readahead_list_head);
5138 +       readahead_list_head = next;
5139 +       return 0;
5140 +}
5141 +
5142 +/*
5143 + * toi_bio_queue_flush_pages: Write out queued pages (loop if dedicated).
5144 + */
5145 +
5146 +static int toi_bio_queue_flush_pages(int dedicated_thread)
5147 +{
5148 +       unsigned long flags;
5149 +       int result = 0;
5150 +
5151 +top:
5152 +       spin_lock_irqsave(&bio_queue_lock, flags);
5153 +       while (bio_queue_head) {
5154 +               struct page *page = bio_queue_head;
5155 +               bio_queue_head = (struct page *) page->private;
5156 +               if (bio_queue_tail == page)
5157 +                       bio_queue_tail = NULL;
5158 +               spin_unlock_irqrestore(&bio_queue_lock, flags);
5159 +               result = toi_bio_rw_page(WRITE, page, 0, 11);
5160 +               if (result)
5161 +                       return result;
5162 +               spin_lock_irqsave(&bio_queue_lock, flags);
5163 +       }
5164 +       spin_unlock_irqrestore(&bio_queue_lock, flags);
5165 +
5166 +       if (dedicated_thread) {
5167 +               wait_event(toi_io_queue_flusher, bio_queue_head ||
5168 +                               toi_bio_queue_flusher_should_finish);
5169 +               if (likely(!toi_bio_queue_flusher_should_finish))
5170 +                       goto top;
5171 +               toi_bio_queue_flusher_should_finish = 0;
5172 +       }
5173 +       return 0;
5174 +}
5175 +
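Together with toi_bio_queue_write(), this forms a simple producer/consumer
pair: any thread queues full pages and wakes the flusher; the dedicated
flusher drains the queue, then sleeps until more pages (or a request to
finish) arrive. In sketch form:

    /* Producer, any thread: */
    toi_bio_queue_write(&toi_writer_buffer);  /* queue the page, wake flusher */

    /* Consumer, the dedicated flusher (via bio_io_flusher(WRITE)): */
    toi_bio_queue_flush_pages(1);  /* drain; wait for more work or for finish */
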
5176 +/*
5177 + * toi_bio_get_new_page: Get a new buffer page, throttling if memory is low.
5178 + */
5179 +static void toi_bio_get_new_page(char **full_buffer)
5180 +{
5181 +       throttle_if_memory_low();
5182 +
5183 +       while (!*full_buffer) {
5184 +               *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
5185 +               if (!*full_buffer) {
5186 +                       set_throttle();
5187 +                       do_bio_wait(7);
5188 +               }
5189 +       }
5190 +}
5191 +
5192 +/*
5193 + * toi_rw_buffer: Combine smaller buffers into PAGE_SIZE I/O.
5194 + *
5195 + * @writing: Bool - whether writing (or reading).
5196 + * @buffer: The start of the buffer to write or fill.
5197 + * @buffer_size: The size of the buffer to write or fill.
5198 + */
5199 +static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
5200 +               int no_readahead)
5201 +{
5202 +       int bytes_left = buffer_size;
5203 +
5204 +       while (bytes_left) {
5205 +               char *source_start = buffer + buffer_size - bytes_left;
5206 +               char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
5207 +               int capacity = PAGE_SIZE - toi_writer_buffer_posn;
5208 +               char *to = writing ? dest_start : source_start;
5209 +               char *from = writing ? source_start : dest_start;
5210 +
5211 +               if (bytes_left <= capacity) {
5212 +                       memcpy(to, from, bytes_left);
5213 +                       toi_writer_buffer_posn += bytes_left;
5214 +                       return 0;
5215 +               }
5216 +
5217 +               /* Complete this page and start a new one */
5218 +               memcpy(to, from, capacity);
5219 +               bytes_left -= capacity;
5220 +
5221 +               if (!writing) {
5222 +                       int result = toi_bio_get_next_page_read(no_readahead);
5223 +                       if (result)
5224 +                               return result;
5225 +               } else {
5226 +                       toi_bio_queue_write(&toi_writer_buffer);
5227 +                       toi_bio_get_new_page(&toi_writer_buffer);
5228 +               }
5229 +
5230 +               toi_writer_buffer_posn = 0;
5231 +               toi_cond_pause(0, NULL);
5232 +       }
5233 +
5234 +       return 0;
5235 +}
5236 +
5237 +/**
5238 + * toi_bio_read_page - read a page of the image.
5239 + *
5240 + * @pfn: The pfn where the data belongs.
5241 + * @buffer_page: The page containing the (possibly compressed) data.
5242 + * @buf_size: The number of bytes on @buffer_page used.
5243 + *
5244 + * Read a (possibly compressed) page from the image, into buffer_page,
5245 + * returning its pfn and the buffer size.
5246 + */
5247 +static int toi_bio_read_page(unsigned long *pfn, struct page *buffer_page,
5248 +               unsigned int *buf_size)
5249 +{
5250 +       int result = 0;
5251 +       char *buffer_virt = kmap(buffer_page);
5252 +
5253 +       inc_pr_index;
5254 +
5255 +       /* Only call start_new_readahead if we don't have a dedicated thread */
5256 +       if (current == toi_queue_flusher && toi_start_new_readahead(0)) {
5257 +               printk(KERN_INFO "Queue flusher and toi_start_new_readahead "
5258 +                               "returned non-zero.\n");
5259 +               return -EIO;
5260 +       }
5261 +
5262 +       my_mutex_lock(0, &toi_bio_mutex);
5263 +
5264 +       if (toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
5265 +           toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
5266 +           toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
5267 +               abort_hibernate(TOI_FAILED_IO, "Read of data failed.");
5268 +               result = 1;
5269 +       } else
5270 +               PR_DEBUG("%d: PFN %ld, %d bytes.\n", pr_index, *pfn, *buf_size);
5271 +
5272 +       my_mutex_unlock(0, &toi_bio_mutex);
5273 +       kunmap(buffer_page);
5274 +       return result;
5275 +}
5276 +
5277 +/**
5278 + * toi_bio_write_page - Write a page of the image.
5279 + *
5280 + * @pfn: The pfn where the data belongs.
5281 + * @buffer_page: The page containing the (possibly compressed) data.
5282 + * @buf_size: The number of bytes on @buffer_page used.
5283 + *
5284 + * Write a (possibly compressed) page to the image from the buffer, together
5285 + * with it's index and buffer size.
5286 + * with its index and buffer size.
5287 +static int toi_bio_write_page(unsigned long pfn, struct page *buffer_page,
5288 +               unsigned int buf_size)
5289 +{
5290 +       char *buffer_virt;
5291 +       int result = 0, result2 = 0;
5292 +
5293 +       inc_pr_index;
5294 +
5295 +       if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
5296 +               return 0;
5297 +
5298 +       my_mutex_lock(1, &toi_bio_mutex);
5299 +       buffer_virt = kmap(buffer_page);
5300 +
5301 +       if (toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
5302 +           toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
5303 +           toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
5304 +               printk(KERN_INFO "toi_rw_buffer returned non-zero to "
5305 +                               "toi_bio_write_page.\n");
5306 +               result = -EIO;
5307 +       }
5308 +
5309 +       PR_DEBUG("%d: Index %ld, %d bytes. Result %d.\n", pr_index, pfn,
5310 +                       buf_size, result);
5311 +
5312 +       kunmap(buffer_page);
5313 +       my_mutex_unlock(1, &toi_bio_mutex);
5314 +
5315 +       if (current == toi_queue_flusher)
5316 +               result2 = toi_bio_queue_flush_pages(0);
5317 +
5318 +       return result ? result : result2;
5319 +}
5320 +
5321 +/**
5322 + * toi_rw_header_chunk: Read or write a portion of the image header.
5323 + *
5324 + * @writing: Whether reading or writing.
5325 + * @owner: The module for which we're writing. Used for confirming that modules
5326 + * don't use more header space than they asked for.
5327 + * @buffer: Address of the data to write.
5328 + * @buffer_size: Size of the data buffer.
5329 + * @no_readahead: Don't try to start readahead (when still getting extents)
5330 + */
5331 +static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
5332 +               char *buffer, int buffer_size, int no_readahead)
5333 +{
5334 +       int result = 0;
5335 +
5336 +       if (owner) {
5337 +               owner->header_used += buffer_size;
5338 +               toi_message(TOI_HEADER, TOI_LOW, 1,
5339 +                       "Header: %s : %d bytes (%d/%d).\n",
5340 +                       owner->name, buffer_size, owner->header_used,
5341 +                       owner->header_requested);
5342 +               if (owner->header_used > owner->header_requested) {
5343 +                       printk(KERN_EMERG "TuxOnIce module %s is using more "
5344 +                               "header space (%u) than it requested (%u).\n",
5345 +                               owner->name,
5346 +                               owner->header_used,
5347 +                               owner->header_requested);
5348 +                       return buffer_size;
5349 +               }
5350 +       } else
5351 +               toi_message(TOI_HEADER, TOI_LOW, 1,
5352 +                       "Header: (No owner): %d bytes.\n", buffer_size);
5353 +
5354 +       if (!writing && !no_readahead)
5355 +               result = toi_start_new_readahead(0);
5356 +
5357 +       if (!result)
5358 +               result = toi_rw_buffer(writing, buffer, buffer_size,
5359 +                               no_readahead);
5360 +
5361 +       return result;
5362 +}
5363 +
5364 +static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
5365 +               char *buffer, int size)
5366 +{
5367 +       return _toi_rw_header_chunk(writing, owner, buffer, size, 0);
5368 +}
5369 +
5370 +static int toi_rw_header_chunk_noreadahead(int writing,
5371 +               struct toi_module_ops *owner, char *buffer, int size)
5372 +{
5373 +       return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
5374 +}
5375 +
5376 +/**
5377 + * write_header_chunk_finish: Flush any buffered header data.
5378 + */
5379 +static int write_header_chunk_finish(void)
5380 +{
5381 +       int result = 0;
5382 +
5383 +       if (toi_writer_buffer_posn)
5384 +               toi_bio_queue_write(&toi_writer_buffer);
5385 +
5386 +       toi_bio_queue_flush_pages(0);
5387 +       toi_finish_all_io();
5388 +
5389 +       return result;
5390 +}
5391 +
5392 +/**
5393 + * toi_bio_storage_needed: Get the amount of storage needed for my fns.
5394 + */
5395 +static int toi_bio_storage_needed(void)
5396 +{
5397 +       return 2 * sizeof(int);
5398 +}
5399 +
5400 +/**
5401 + * toi_bio_save_config_info: Save block i/o config to image header.
5402 + *
5403 + * @buf: PAGE_SIZE'd buffer into which data should be saved.
5404 + */
5405 +static int toi_bio_save_config_info(char *buf)
5406 +{
5407 +       int *ints = (int *) buf;
5408 +       ints[0] = target_outstanding_io;
5409 +       return sizeof(int);
5410 +}
5411 +
5412 +/**
5413 + * toi_bio_load_config_info: Restore block i/o config.
5414 + *
5415 + * @buf: Data to be reloaded.
5416 + * @size: Size of the buffer saved.
5417 + */
5418 +static void toi_bio_load_config_info(char *buf, int size)
5419 +{
5420 +       int *ints = (int *) buf;
5421 +       target_outstanding_io = ints[0];
5422 +}
5423 +
5424 +/**
5425 + * toi_bio_initialise: Initialise bio code at start of some action.
5426 + *
5427 + * @starting_cycle: Whether starting a hibernation cycle, or just reading or
5428 + * writing a sysfs value.
5429 + */
5430 +static int toi_bio_initialise(int starting_cycle)
5431 +{
5432 +       if (starting_cycle) {
5433 +               max_outstanding_writes = 0;
5434 +               max_outstanding_reads = 0;
5435 +               toi_queue_flusher = current;
5436 +#ifdef MEASURE_MUTEX_CONTENTION
5437 +               {
5438 +               int i, j, k;
5439 +
5440 +               for (i = 0; i < 2; i++)
5441 +                       for (j = 0; j < 2; j++)
5442 +                               for_each_online_cpu(k)
5443 +                                       mutex_times[i][j][k] = 0;
5444 +               }
5445 +#endif
5446 +       }
5447 +
5448 +       return 0;
5449 +}
5450 +
5451 +/**
5452 + * toi_bio_cleanup: Cleanup after some action.
5453 + *
5454 + * @finishing_cycle: Whether completing a cycle.
5455 + */
5456 +static void toi_bio_cleanup(int finishing_cycle)
5457 +{
5458 +       if (toi_writer_buffer) {
5459 +               toi_free_page(11, (unsigned long) toi_writer_buffer);
5460 +               toi_writer_buffer = NULL;
5461 +       }
5462 +}
5463 +
5464 +struct toi_bio_ops toi_bio_ops = {
5465 +       .bdev_page_io = toi_bdev_page_io,
5466 +       .finish_all_io = toi_finish_all_io,
5467 +       .forward_one_page = go_next_page,
5468 +       .set_extra_page_forward = set_extra_page_forward,
5469 +       .set_devinfo = toi_set_devinfo,
5470 +       .read_page = toi_bio_read_page,
5471 +       .write_page = toi_bio_write_page,
5472 +       .rw_init = toi_rw_init,
5473 +       .rw_cleanup = toi_rw_cleanup,
5474 +       .read_header_init = toi_read_header_init,
5475 +       .rw_header_chunk = toi_rw_header_chunk,
5476 +       .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead,
5477 +       .write_header_chunk_finish = write_header_chunk_finish,
5478 +       .io_flusher = bio_io_flusher,
5479 +};
5480 +
5481 +static struct toi_sysfs_data sysfs_params[] = {
5482 +       { TOI_ATTR("target_outstanding_io", SYSFS_RW),
5483 +         SYSFS_INT(&target_outstanding_io, 0, TARGET_OUTSTANDING_IO, 0),
5484 +       }
5485 +};
5486 +
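This exposes target_outstanding_io as a read/write integer clamped to the
range 0..TARGET_OUTSTANDING_IO (16384) in the module's block_io sysfs
directory; under TuxOnIce's usual layout that should appear as
/sys/power/tuxonice/block_io/target_outstanding_io, though the exact path
depends on where the sysfs root is registered.
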
5487 +static struct toi_module_ops toi_blockwriter_ops = {
5488 +       .name                                   = "lowlevel i/o",
5489 +       .type                                   = MISC_HIDDEN_MODULE,
5490 +       .directory                              = "block_io",
5491 +       .module                                 = THIS_MODULE,
5492 +       .print_debug_info                       = toi_bio_print_debug_stats,
5493 +       .memory_needed                          = toi_bio_memory_needed,
5494 +       .storage_needed                         = toi_bio_storage_needed,
5495 +       .save_config_info                       = toi_bio_save_config_info,
5496 +       .load_config_info                       = toi_bio_load_config_info,
5497 +       .initialise                             = toi_bio_initialise,
5498 +       .cleanup                                = toi_bio_cleanup,
5499 +
5500 +       .sysfs_data             = sysfs_params,
5501 +       .num_sysfs_entries      = sizeof(sysfs_params) /
5502 +               sizeof(struct toi_sysfs_data),
5503 +};
5504 +
5505 +/**
5506 + * toi_block_io_load: Load time routine for block i/o module.
5507 + *
5508 + * Register block i/o ops and sysfs entries.
5509 + */
5510 +static __init int toi_block_io_load(void)
5511 +{
5512 +       return toi_register_module(&toi_blockwriter_ops);
5513 +}
5514 +
5515 +#if defined(CONFIG_TOI_FILE_EXPORTS) || defined(CONFIG_TOI_SWAP_EXPORTS)
5516 +EXPORT_SYMBOL_GPL(toi_writer_posn);
5517 +EXPORT_SYMBOL_GPL(toi_writer_posn_save);
5518 +EXPORT_SYMBOL_GPL(toi_writer_buffer);
5519 +EXPORT_SYMBOL_GPL(toi_writer_buffer_posn);
5520 +EXPORT_SYMBOL_GPL(toi_bio_ops);
5521 +#endif
5522 +#ifdef MODULE
5523 +static __exit void toi_block_io_unload(void)
5524 +{
5525 +       toi_unregister_module(&toi_blockwriter_ops);
5526 +}
5527 +
5528 +module_init(toi_block_io_load);
5529 +module_exit(toi_block_io_unload);
5530 +MODULE_LICENSE("GPL");
5531 +MODULE_AUTHOR("Nigel Cunningham");
5532 +MODULE_DESCRIPTION("TuxOnIce block io functions");
5533 +#else
5534 +late_initcall(toi_block_io_load);
5535 +#endif
5536 diff --git a/kernel/power/tuxonice_block_io.h b/kernel/power/tuxonice_block_io.h
5537 new file mode 100644
5538 index 0000000..54b72ca
5539 --- /dev/null
5540 +++ b/kernel/power/tuxonice_block_io.h
5541 @@ -0,0 +1,57 @@
5542 +/*
5543 + * kernel/power/tuxonice_block_io.h
5544 + *
5545 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
5546 + * Copyright (C) 2006 Red Hat, Inc.
5547 + *
5548 + * Distributed under GPLv2.
5549 + *
5550 + * This file contains declarations for functions exported from
5551 + * tuxonice_block_io.c, which contains low level io functions.
5552 + */
5553 +
5554 +#include <linux/buffer_head.h>
5555 +#include "tuxonice_extent.h"
5556 +
5557 +struct toi_bdev_info {
5558 +       struct block_device *bdev;
5559 +       dev_t dev_t;
5560 +       int bmap_shift;
5561 +       int blocks_per_page;
5562 +};
5563 +
5564 +/*
5565 + * Our exported interface so the swapwriter and filewriter don't
5566 + * need these functions duplicated.
5567 + */
5568 +struct toi_bio_ops {
5569 +       int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
5570 +                       struct page *page);
5571 +       void (*check_io_stats) (void);
5572 +       void (*reset_io_stats) (void);
5573 +       void (*finish_all_io) (void);
5574 +       int (*forward_one_page) (int writing);
5575 +       void (*set_extra_page_forward) (void);
5576 +       void (*set_devinfo) (struct toi_bdev_info *info);
5577 +       int (*read_page) (unsigned long *index, struct page *buffer_page,
5578 +                       unsigned int *buf_size);
5579 +       int (*write_page) (unsigned long index, struct page *buffer_page,
5580 +                       unsigned int buf_size);
5581 +       void (*read_header_init) (void);
5582 +       int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
5583 +                       char *buffer, int buffer_size);
5584 +       int (*rw_header_chunk_noreadahead) (int rw,
5585 +                       struct toi_module_ops *owner,
5586 +                       char *buffer, int buffer_size);
5587 +       int (*write_header_chunk_finish) (void);
5588 +       int (*rw_init) (int rw, int stream_number);
5589 +       int (*rw_cleanup) (int rw);
5590 +       void (*io_flusher) (int rw);
5591 +};
5592 +
5593 +extern struct toi_bio_ops toi_bio_ops;
5594 +
5595 +extern char *toi_writer_buffer;
5596 +extern int toi_writer_buffer_posn;
5597 +extern struct hibernate_extent_iterate_saved_state toi_writer_posn_save[4];
5598 +extern struct hibernate_extent_iterate_state toi_writer_posn;
5599 diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
5600 new file mode 100644
5601 index 0000000..e5525d2
5602 --- /dev/null
5603 +++ b/kernel/power/tuxonice_builtin.c
5604 @@ -0,0 +1,400 @@
5605 +/*
5606 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
5607 + *
5608 + * This file is released under the GPLv2.
5609 + */
5610 +#include <linux/module.h>
5611 +#include <linux/resume-trace.h>
5612 +#include <linux/kernel.h>
5613 +#include <linux/swap.h>
5614 +#include <linux/syscalls.h>
5615 +#include <linux/bio.h>
5616 +#include <linux/root_dev.h>
5617 +#include <linux/freezer.h>
5618 +#include <linux/reboot.h>
5619 +#include <linux/writeback.h>
5620 +#include <linux/tty.h>
5621 +#include <linux/crypto.h>
5622 +#include <linux/cpu.h>
5623 +#include <linux/dyn_pageflags.h>
5624 +#include <linux/ctype.h>
5625 +#include "tuxonice_io.h"
5626 +#include "tuxonice.h"
5627 +#include "tuxonice_extent.h"
5628 +#include "tuxonice_block_io.h"
5629 +#include "tuxonice_netlink.h"
5630 +#include "tuxonice_prepare_image.h"
5631 +#include "tuxonice_ui.h"
5632 +#include "tuxonice_sysfs.h"
5633 +#include "tuxonice_pagedir.h"
5634 +#include "tuxonice_modules.h"
5635 +#include "tuxonice_builtin.h"
5636 +#include "tuxonice_power_off.h"
5637 +#include "power.h"
5638 +
5639 +/*
5640 + * Highmem related functions (x86 only).
5641 + */
5642 +
5643 +#ifdef CONFIG_HIGHMEM
5644 +
5645 +/**
5646 + * copyback_high: Restore highmem pages.
5647 + *
5648 + * Highmem data and pbe lists are/can be stored in highmem.
5649 + * The format is slightly different to the lowmem pbe lists
5650 + * used for the assembly code: the last pbe in each page is
5651 + * a struct page * instead of struct pbe *, pointing to the
5652 + * next page where pbes are stored (or NULL if happens to be
5653 + * the end of the list). Since we don't want to generate
5654 + * unnecessary deltas against swsusp code, we use a cast
5655 + * instead of a union.
5656 + **/
5657 +
5658 +static void copyback_high(void)
5659 +{
5660 +       struct page *pbe_page = (struct page *) restore_highmem_pblist;
5661 +       struct pbe *this_pbe, *first_pbe;
5662 +       unsigned long *origpage, *copypage;
5663 +       int pbe_index = 1;
5664 +
5665 +       if (!pbe_page)
5666 +               return;
5667 +
5668 +       this_pbe = (struct pbe *) kmap_atomic(pbe_page, KM_BOUNCE_READ);
5669 +       first_pbe = this_pbe;
5670 +
5671 +       while (this_pbe) {
5672 +               int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
5673 +
5674 +               origpage = kmap_atomic((struct page *) this_pbe->orig_address,
5675 +                       KM_BIO_DST_IRQ);
5676 +               copypage = kmap_atomic((struct page *) this_pbe->address,
5677 +                       KM_BIO_SRC_IRQ);
5678 +
5679 +               while (loop >= 0) {
5680 +                       *(origpage + loop) = *(copypage + loop);
5681 +                       loop--;
5682 +               }
5683 +
5684 +               kunmap_atomic(origpage, KM_BIO_DST_IRQ);
5685 +               kunmap_atomic(copypage, KM_BIO_SRC_IRQ);
5686 +
5687 +               if (!this_pbe->next)
5688 +                       break;
5689 +
5690 +               if (pbe_index < PBES_PER_PAGE) {
5691 +                       this_pbe++;
5692 +                       pbe_index++;
5693 +               } else {
5694 +                       pbe_page = (struct page *) this_pbe->next;
5695 +                       kunmap_atomic(first_pbe, KM_BOUNCE_READ);
5696 +                       if (!pbe_page)
5697 +                               return;
5698 +                       this_pbe = (struct pbe *) kmap_atomic(pbe_page,
5699 +                                       KM_BOUNCE_READ);
5700 +                       first_pbe = this_pbe;
5701 +                       pbe_index = 1;
5702 +               }
5703 +       }
5704 +       kunmap_atomic(first_pbe, KM_BOUNCE_READ);
5705 +}
5706 +
5707 +#else /* CONFIG_HIGHMEM */
5708 +static void copyback_high(void) { }
5709 +#endif
5710 +
5711 +char toi_wait_for_keypress_dev_console(int timeout)
5712 +{
5713 +       int fd, this_timeout = 255;
5714 +       char key = '\0';
5715 +       struct termios t, t_backup;
5716 +
5717 +       /* We should be guaranteed /dev/console exists after populate_rootfs()
5718 +        * in init/main.c.
5719 +        */
5720 +       fd = sys_open("/dev/console", O_RDONLY, 0);
5721 +       if (fd < 0) {
5722 +               printk(KERN_INFO "Couldn't open /dev/console.\n");
5723 +               return key;
5724 +       }
5725 +
5726 +       if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
5727 +               goto out_close;
5728 +
5729 +       memcpy(&t_backup, &t, sizeof(t));
5730 +
5731 +       t.c_lflag &= ~(ISIG|ICANON|ECHO);
5732 +       t.c_cc[VMIN] = 0;
5733 +
5734 +new_timeout:
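+       /* VTIME counts tenths of a second and lives in a single byte, so
+        * wait in chunks of at most 25 seconds (250 tenths), looping via
+        * new_timeout until the caller's timeout is used up. */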
5735 +       if (timeout > 0) {
5736 +               this_timeout = timeout < 26 ? timeout : 25;
5737 +               timeout -= this_timeout;
5738 +               this_timeout *= 10;
5739 +       }
5740 +
5741 +       t.c_cc[VTIME] = this_timeout;
5742 +
5743 +       if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
5744 +               goto out_restore;
5745 +
5746 +       while (1) {
5747 +               if (sys_read(fd, &key, 1) <= 0) {
5748 +                       if (timeout)
5749 +                               goto new_timeout;
5750 +                       key = '\0';
5751 +                       break;
5752 +               }
5753 +               key = tolower(key);
5754 +               if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
5755 +                       if (key == 'c') {
5756 +                               set_toi_state(TOI_CONTINUE_REQ);
5757 +                               break;
5758 +                       } else if (key == ' ')
5759 +                               break;
5760 +               } else
5761 +                       break;
5762 +       }
5763 +
5764 +out_restore:
5765 +       sys_ioctl(fd, TCSETS, (long)&t_backup);
5766 +out_close:
5767 +       sys_close(fd);
5768 +
5769 +       return key;
5770 +}
5771 +
5772 +struct toi_boot_kernel_data toi_bkd __nosavedata
5773 +               __attribute__((aligned(PAGE_SIZE))) = {
5774 +       MY_BOOT_KERNEL_DATA_VERSION,
5775 +       0,
5776 +#ifdef CONFIG_TOI_REPLACE_SWSUSP
5777 +       (1 << TOI_REPLACE_SWSUSP) |
5778 +#endif
5779 +       (1 << TOI_NO_FLUSHER_THREAD) |
5780 +       (1 << TOI_PAGESET2_FULL) | (1 << TOI_LATE_CPU_HOTPLUG),
5781 +};
5782 +EXPORT_SYMBOL_GPL(toi_bkd);
5783 +
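+/* Open a block device by number, translating FMODE_WRITE into the O_RDWR
+ * flag that blkdev_get() expects and opening without blocking. */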
5784 +struct block_device *toi_open_by_devnum(dev_t dev, unsigned mode)
5785 +{
5786 +       struct block_device *bdev = bdget(dev);
5787 +       int err = -ENOMEM;
5788 +       int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
5789 +       flags |= O_NONBLOCK;
5790 +       if (bdev)
5791 +               err = blkdev_get(bdev, mode, flags);
5792 +       return err ? ERR_PTR(err) : bdev;
5793 +}
5794 +EXPORT_SYMBOL_GPL(toi_open_by_devnum);
5795 +
5796 +EXPORT_SYMBOL_GPL(toi_wait_for_keypress_dev_console);
5797 +EXPORT_SYMBOL_GPL(hibernation_platform_enter);
5798 +EXPORT_SYMBOL_GPL(platform_begin);
5799 +EXPORT_SYMBOL_GPL(platform_pre_snapshot);
5800 +EXPORT_SYMBOL_GPL(platform_leave);
5801 +EXPORT_SYMBOL_GPL(platform_end);
5802 +EXPORT_SYMBOL_GPL(platform_finish);
5803 +EXPORT_SYMBOL_GPL(platform_pre_restore);
5804 +EXPORT_SYMBOL_GPL(platform_restore_cleanup);
5805 +EXPORT_SYMBOL_GPL(power_kobj);
5806 +EXPORT_SYMBOL_GPL(pm_notifier_call_chain);
5807 +EXPORT_SYMBOL_GPL(init_swsusp_header);
5808 +
5809 +#ifdef CONFIG_ARCH_HIBERNATION_HEADER
5810 +EXPORT_SYMBOL_GPL(arch_hibernation_header_save);
5811 +EXPORT_SYMBOL_GPL(arch_hibernation_header_restore);
5812 +#endif
5813 +
5814 +#ifdef CONFIG_TOI_CORE_EXPORTS
5815 +#ifdef CONFIG_X86_64
5816 +EXPORT_SYMBOL_GPL(restore_processor_state);
5817 +EXPORT_SYMBOL_GPL(save_processor_state);
5818 +#endif
5819 +
5820 +EXPORT_SYMBOL_GPL(drop_pagecache);
5821 +EXPORT_SYMBOL_GPL(restore_pblist);
5822 +EXPORT_SYMBOL_GPL(pm_mutex);
5823 +EXPORT_SYMBOL_GPL(pm_restore_console);
5824 +EXPORT_SYMBOL_GPL(super_blocks);
5825 +EXPORT_SYMBOL_GPL(next_zone);
5826 +
5827 +EXPORT_SYMBOL_GPL(freeze_processes);
5828 +EXPORT_SYMBOL_GPL(thaw_processes);
5829 +EXPORT_SYMBOL_GPL(thaw_kernel_threads);
5830 +EXPORT_SYMBOL_GPL(shrink_all_memory);
5831 +EXPORT_SYMBOL_GPL(shrink_one_zone);
5832 +EXPORT_SYMBOL_GPL(saveable_page);
5833 +EXPORT_SYMBOL_GPL(swsusp_arch_suspend);
5834 +EXPORT_SYMBOL_GPL(swsusp_arch_resume);
5835 +EXPORT_SYMBOL_GPL(pm_prepare_console);
5836 +EXPORT_SYMBOL_GPL(follow_page);
5837 +EXPORT_SYMBOL_GPL(machine_halt);
5838 +EXPORT_SYMBOL_GPL(block_dump);
5839 +EXPORT_SYMBOL_GPL(unlink_lru_lists);
5840 +EXPORT_SYMBOL_GPL(relink_lru_lists);
5841 +EXPORT_SYMBOL_GPL(machine_power_off);
5842 +EXPORT_SYMBOL_GPL(suspend_devices_and_enter);
5843 +EXPORT_SYMBOL_GPL(first_online_pgdat);
5844 +EXPORT_SYMBOL_GPL(next_online_pgdat);
5845 +EXPORT_SYMBOL_GPL(machine_restart);
5846 +EXPORT_SYMBOL_GPL(saved_command_line);
5847 +EXPORT_SYMBOL_GPL(tasklist_lock);
5848 +#ifdef CONFIG_PM_SLEEP_SMP
5849 +EXPORT_SYMBOL_GPL(disable_nonboot_cpus);
5850 +EXPORT_SYMBOL_GPL(enable_nonboot_cpus);
5851 +#endif
5852 +#endif
5853 +
5854 +int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
5855 +
5856 +#ifdef CONFIG_TOI_USERUI_EXPORTS
5857 +EXPORT_SYMBOL_GPL(kmsg_redirect);
5858 +#endif
5859 +EXPORT_SYMBOL_GPL(toi_wait);
5860 +
5861 +#if defined(CONFIG_TOI_USERUI_EXPORTS) || defined(CONFIG_TOI_CORE_EXPORTS)
5862 +EXPORT_SYMBOL_GPL(console_printk);
5863 +#endif
5864 +#ifdef CONFIG_TOI_SWAP_EXPORTS /* TuxOnIce swap specific */
5865 +EXPORT_SYMBOL_GPL(sys_swapon);
5866 +EXPORT_SYMBOL_GPL(sys_swapoff);
5867 +EXPORT_SYMBOL_GPL(si_swapinfo);
5868 +EXPORT_SYMBOL_GPL(map_swap_page);
5869 +EXPORT_SYMBOL_GPL(get_swap_page);
5870 +EXPORT_SYMBOL_GPL(swap_free);
5871 +EXPORT_SYMBOL_GPL(get_swap_info_struct);
5872 +#endif
5873 +
5874 +#ifdef CONFIG_TOI_FILE_EXPORTS
5875 +/* TuxOnice file allocator specific support */
5876 +EXPORT_SYMBOL_GPL(sys_unlink);
5877 +EXPORT_SYMBOL_GPL(sys_mknod);
5878 +#endif
5879 +
5880 +/* Swap or file */
5881 +#if defined(CONFIG_TOI_FILE_EXPORTS) || defined(CONFIG_TOI_SWAP_EXPORTS)
5882 +EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
5883 +EXPORT_SYMBOL_GPL(name_to_dev_t);
5884 +#endif
5885 +
5886 +#if defined(CONFIG_TOI_FILE_EXPORTS) || defined(CONFIG_TOI_SWAP_EXPORTS) || \
5887 +       defined(CONFIG_TOI_CORE_EXPORTS)
5888 +EXPORT_SYMBOL_GPL(resume_file);
5889 +#endif
5890 +struct toi_core_fns *toi_core_fns;
5891 +EXPORT_SYMBOL_GPL(toi_core_fns);
5892 +
5893 +DECLARE_DYN_PAGEFLAGS(pageset1_map);
5894 +DECLARE_DYN_PAGEFLAGS(pageset1_copy_map);
5895 +EXPORT_SYMBOL_GPL(pageset1_map);
5896 +EXPORT_SYMBOL_GPL(pageset1_copy_map);
5897 +
5898 +unsigned long toi_result;
5899 +struct pagedir pagedir1 = {1};
5900 +
5901 +EXPORT_SYMBOL_GPL(toi_result);
5902 +EXPORT_SYMBOL_GPL(pagedir1);
5903 +
5904 +unsigned long toi_get_nonconflicting_page(void)
5905 +{
5906 +       return toi_core_fns->get_nonconflicting_page();
5907 +}
5908 +
5909 +int toi_post_context_save(void)
5910 +{
5911 +       return toi_core_fns->post_context_save();
5912 +}
5913 +
5914 +int toi_try_hibernate(int have_pmsem)
5915 +{
5916 +       if (!toi_core_fns)
5917 +               return -ENODEV;
5918 +
5919 +       return toi_core_fns->try_hibernate(have_pmsem);
5920 +}
5921 +
5922 +void toi_try_resume(void)
5923 +{
5924 +       if (toi_core_fns)
5925 +               toi_core_fns->try_resume();
5926 +       else
5927 +               printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
5928 +}
5929 +
5930 +int toi_lowlevel_builtin(void)
5931 +{
5932 +       int error = 0;
5933 +
5934 +       save_processor_state();
5935 +       error = swsusp_arch_suspend();
5936 +       if (error)
5937 +               printk(KERN_ERR "Error %d hibernating\n", error);
5938 +
5939 +       /* Restore control flow appears here */
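+       /* toi_in_hibernate is __nosavedata: it is still set in the kernel
+        * that wrote the image, but keeps the booting kernel's clear value
+        * once an image has been restored, so it distinguishes the two
+        * returns from swsusp_arch_suspend(). */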
5940 +       if (!toi_in_hibernate) {
5941 +               copyback_high();
5942 +               set_toi_state(TOI_NOW_RESUMING);
5943 +       }
5944 +
5945 +       restore_processor_state();
5946 +
5947 +       return error;
5948 +}
5949 +
5950 +EXPORT_SYMBOL_GPL(toi_lowlevel_builtin);
5951 +
5952 +unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
5953 +EXPORT_SYMBOL_GPL(toi_compress_bytes_in);
5954 +EXPORT_SYMBOL_GPL(toi_compress_bytes_out);
5955 +
5956 +unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
5957 +               (1 << TOI_IGNORE_LOGLEVEL) |
5958 +               (1 << TOI_IO_STOPPED));
5959 +EXPORT_SYMBOL_GPL(toi_state);
5960 +
5961 +/* The number of hibernates we have started (some may have been cancelled) */
5962 +unsigned int nr_hibernates;
5963 +EXPORT_SYMBOL_GPL(nr_hibernates);
5964 +
5965 +int toi_running;
5966 +EXPORT_SYMBOL_GPL(toi_running);
5967 +
5968 +int toi_in_hibernate __nosavedata;
5969 +EXPORT_SYMBOL_GPL(toi_in_hibernate);
5970 +
5971 +__nosavedata struct pbe *restore_highmem_pblist;
5972 +
5973 +#ifdef CONFIG_TOI_CORE_EXPORTS
5974 +#ifdef CONFIG_HIGHMEM
5975 +EXPORT_SYMBOL_GPL(nr_free_highpages);
5976 +EXPORT_SYMBOL_GPL(saveable_highmem_page);
5977 +EXPORT_SYMBOL_GPL(restore_highmem_pblist);
5978 +#endif
5979 +#endif
5980 +
5981 +#if defined(CONFIG_TOI_CORE_EXPORTS) || defined(CONFIG_TOI_PAGEFLAGS_EXPORTS)
5982 +EXPORT_SYMBOL_GPL(max_pfn);
5983 +#endif
5984 +
5985 +#if defined(CONFIG_TOI_EXPORTS) || defined(CONFIG_TOI_CORE_EXPORTS)
5986 +EXPORT_SYMBOL_GPL(snprintf_used);
5987 +#endif
5988 +
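+/*
+ * Parse the "toi_wait" boot parameter (e.g. booting with "toi_wait=10").
+ * Values from -1 to 255 are stored in toi_wait; anything else is refused
+ * with a message.
+ */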
5989 +static int __init toi_wait_setup(char *str)
5990 +{
5991 +       int value;
5992 +
5993 +       if (sscanf(str, "=%d", &value)) {
5994 +               if (value < -1 || value > 255)
5995 +                       printk(KERN_INFO "toi_wait outside range -1 to "
5996 +                                       "255.\n");
5997 +               else
5998 +                       toi_wait = value;
5999 +       }
6000 +
6001 +       return 1;
6002 +}
6003 +
6004 +__setup("toi_wait", toi_wait_setup);
6005 diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
6006 new file mode 100644
6007 index 0000000..f4966c4
6008 --- /dev/null
6009 +++ b/kernel/power/tuxonice_builtin.h
6010 @@ -0,0 +1,31 @@
6011 +/*
6012 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
6013 + *
6014 + * This file is released under the GPLv2.
6015 + */
6016 +#include <linux/dyn_pageflags.h>
6017 +#include <asm/setup.h>
6018 +
6019 +extern struct toi_core_fns *toi_core_fns;
6020 +extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
6021 +extern unsigned int nr_hibernates;
6022 +extern int toi_in_hibernate;
6023 +
6024 +extern __nosavedata struct pbe *restore_highmem_pblist;
6025 +
6026 +int toi_lowlevel_builtin(void);
6027 +
6028 +extern struct dyn_pageflags __nosavedata toi_nosave_origmap;
6029 +extern struct dyn_pageflags __nosavedata toi_nosave_copymap;
6030 +
6031 +#ifdef CONFIG_HIGHMEM
6032 +extern __nosavedata struct zone_data *toi_nosave_zone_list;
6033 +extern __nosavedata unsigned long toi_nosave_max_pfn;
6034 +#endif
6035 +
6036 +extern unsigned long toi_get_nonconflicting_page(void);
6037 +extern int toi_post_context_save(void);
6038 +extern int toi_try_hibernate(int have_pmsem);
6039 +extern char toi_wait_for_keypress_dev_console(int timeout);
6040 +extern struct block_device *toi_open_by_devnum(dev_t dev, unsigned mode);
6041 +extern int toi_wait;
6042 diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
6043 new file mode 100644
6044 index 0000000..6e26130
6045 --- /dev/null
6046 +++ b/kernel/power/tuxonice_checksum.c
6047 @@ -0,0 +1,389 @@
6048 +/*
6049 + * kernel/power/tuxonice_checksum.c
6050 + *
6051 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
6052 + * Copyright (C) 2006 Red Hat, inc.
6053 + *
6054 + * This file is released under the GPLv2.
6055 + *
6056 + * This file contains data checksum routines for TuxOnIce,
6057 + * using cryptoapi. They are used to locate any modifications
6058 + * made to pageset 2 while we're saving it.
6059 + */
6060 +
6061 +#include <linux/suspend.h>
6062 +#include <linux/module.h>
6063 +#include <linux/highmem.h>
6064 +#include <linux/vmalloc.h>
6065 +#include <linux/crypto.h>
6066 +#include <linux/scatterlist.h>
6067 +
6068 +#include "tuxonice.h"
6069 +#include "tuxonice_modules.h"
6070 +#include "tuxonice_sysfs.h"
6071 +#include "tuxonice_io.h"
6072 +#include "tuxonice_pageflags.h"
6073 +#include "tuxonice_checksum.h"
6074 +#include "tuxonice_pagedir.h"
6075 +#include "tuxonice_alloc.h"
6076 +
6077 +static struct toi_module_ops toi_checksum_ops;
6078 +
6079 +/* Constant for now, but I might allow tuning later */
6080 +static char toi_checksum_name[32] = "md4";
6081 +/* Bytes per checksum */
6082 +#define CHECKSUM_SIZE (16)
6083 +
6084 +#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
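+
+/*
+ * Each checksum page gives up sizeof(void *) bytes to a next-page
+ * pointer; with 4K pages, 16-byte digests and 8-byte pointers that
+ * leaves (4096 - 8) / 16 = 255 checksums per page.
+ */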
6085 +
6086 +struct cpu_context {
6087 +       struct crypto_hash *transform;
6088 +       struct hash_desc desc;
6089 +       struct scatterlist sg[2];
6090 +       char *buf;
6091 +};
6092 +
6093 +static DEFINE_PER_CPU(struct cpu_context, contexts);
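+
+/*
+ * pages_allocated pages are chained through page_list: the first
+ * sizeof(void *) bytes of each page hold the address of the next page
+ * (zero at the end of the chain), the remainder holds checksums.
+ */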
6094 +static int pages_allocated;
6095 +static unsigned long page_list;
6096 +
6097 +static int toi_num_resaved;
6098 +
6099 +static unsigned long this_checksum, next_page;
6100 +static int checksum_index;
6101 +
6102 +static inline int checksum_pages_needed(void)
6103 +{
6104 +       return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
6105 +}
6106 +
6107 +/* ---- Local buffer management ---- */
6108 +
6109 +/*
6110 + * toi_checksum_cleanup
6111 + *
6112 + * Frees memory allocated for our labours.
6113 + */
6114 +static void toi_checksum_cleanup(int ending_cycle)
6115 +{
6116 +       int cpu;
6117 +
6118 +       if (ending_cycle) {
6119 +               for_each_online_cpu(cpu) {
6120 +                       struct cpu_context *this = &per_cpu(contexts, cpu);
6121 +                       if (this->transform) {
6122 +                               crypto_free_hash(this->transform);
6123 +                               this->transform = NULL;
6124 +                               this->desc.tfm = NULL;
6125 +                       }
6126 +
6127 +                       if (this->buf) {
6128 +                               toi_free_page(27, (unsigned long) this->buf);
6129 +                               this->buf = NULL;
6130 +                       }
6131 +               }
6132 +       }
6133 +}
6134 +
6135 +/*
6136 + * toi_checksum_initialise
6137 + *
6138 + * Prepare to do some work by allocating buffers and transforms.
6139 + * Returns: Int: Zero on success, or when checksumming is disabled or
6140 + * unused this cycle; one if setting up the checksum transform failed.
6141 + */
6142 +static int toi_checksum_initialise(int starting_cycle)
6143 +{
6144 +       int cpu;
6145 +
6146 +       if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
6147 +               return 0;
6148 +
6149 +       if (!*toi_checksum_name) {
6150 +               printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
6151 +               return 1;
6152 +       }
6153 +
6154 +       for_each_online_cpu(cpu) {
6155 +               struct cpu_context *this = &per_cpu(contexts, cpu);
6156 +               struct page *page;
6157 +
6158 +               this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
6159 +               if (IS_ERR(this->transform)) {
6160 +                       printk(KERN_INFO "TuxOnIce: Failed to initialise the "
6161 +                               "%s checksum algorithm: %ld.\n",
6162 +                               toi_checksum_name, (long) this->transform);
6163 +                       this->transform = NULL;
6164 +                       return 1;
6165 +               }
6166 +
6167 +               this->desc.tfm = this->transform;
6168 +               this->desc.flags = 0;
6169 +
6170 +               page = toi_alloc_page(27, GFP_KERNEL);
6171 +               if (!page)
6172 +                       return 1;
6173 +               this->buf = page_address(page);
6174 +               sg_init_one(&this->sg[0], this->buf, PAGE_SIZE);
6175 +       }
6176 +       return 0;
6177 +}
6178 +
6179 +/*
6180 + * toi_checksum_print_debug_stats
6181 + * @buffer: Pointer to a buffer into which the debug info will be printed.
6182 + * @size: Size of the buffer.
6183 + *
6184 + * Print information to be recorded for debugging purposes into a buffer.
6185 + * Returns: Number of characters written to the buffer.
6186 + */
6187 +
6188 +static int toi_checksum_print_debug_stats(char *buffer, int size)
6189 +{
6190 +       int len;
6191 +
6192 +       if (!toi_checksum_ops.enabled)
6193 +               return snprintf_used(buffer, size,
6194 +                       "- Checksumming disabled.\n");
6195 +
6196 +       len = snprintf_used(buffer, size, "- Checksum method is '%s'.\n",
6197 +                       toi_checksum_name);
6198 +       len += snprintf_used(buffer + len, size - len,
6199 +               "  %d pages resaved in atomic copy.\n", toi_num_resaved);
6200 +       return len;
6201 +}
6202 +
6203 +static int toi_checksum_memory_needed(void)
6204 +{
6205 +       return toi_checksum_ops.enabled ?
6206 +               checksum_pages_needed() << PAGE_SHIFT : 0;
6207 +}
6208 +
6209 +static int toi_checksum_storage_needed(void)
6210 +{
6211 +       if (toi_checksum_ops.enabled)
6212 +               return strlen(toi_checksum_name) + sizeof(int) + 1;
6213 +       else
6214 +               return 0;
6215 +}
6216 +
6217 +/*
6218 + * toi_checksum_save_config_info
6219 + * @buffer: Pointer to a buffer of size PAGE_SIZE.
6220 + *
6221 + * Save information needed when reloading the image at resume time.
6222 + * Returns: Number of bytes used for saving our data.
6223 + */
6224 +static int toi_checksum_save_config_info(char *buffer)
6225 +{
6226 +       int namelen = strlen(toi_checksum_name) + 1;
6227 +       int total_len;
6228 +
6229 +       *((unsigned int *) buffer) = namelen;
6230 +       strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen);
6231 +       total_len = sizeof(unsigned int) + namelen;
6232 +       return total_len;
6233 +}
6234 +
6235 +/* toi_checksum_load_config_info
6236 + * @buffer: Pointer to the start of the data.
6237 + * @size: Number of bytes that were saved.
6238 + *
6239 + * Description:        Reload the checksum configuration (algorithm name)
6240 + * needed to verify the image at resume time.
6241 + */
6242 +static void toi_checksum_load_config_info(char *buffer, int size)
6243 +{
6244 +       int namelen;
6245 +
6246 +       namelen = *((unsigned int *) (buffer));
6247 +       strncpy(toi_checksum_name, buffer + sizeof(unsigned int),
6248 +                       namelen);
6249 +       return;
6250 +}
6251 +
6252 +/*
6253 + * Free Checksum Memory
6254 + */
6255 +
6256 +void free_checksum_pages(void)
6257 +{
6258 +       while (pages_allocated) {
6259 +               unsigned long next = *((unsigned long *) page_list);
6260 +               ClearPageNosave(virt_to_page(page_list));
6261 +               toi_free_page(15, (unsigned long) page_list);
6262 +               page_list = next;
6263 +               pages_allocated--;
6264 +       }
6265 +}
6266 +
6267 +/*
6268 + * Allocate Checksum Memory
6269 + */
6270 +
6271 +int allocate_checksum_pages(void)
6272 +{
6273 +       int pages_needed = checksum_pages_needed();
6274 +
6275 +       if (!toi_checksum_ops.enabled)
6276 +               return 0;
6277 +
6278 +       while (pages_allocated < pages_needed) {
6279 +               unsigned long *new_page =
6280 +                 (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
6281 +               if (!new_page) {
6282 +                       printk(KERN_ERR "Unable to allocate checksum pages.\n");
6283 +                       return -ENOMEM;
6284 +               }
6285 +               SetPageNosave(virt_to_page(new_page));
6286 +               (*new_page) = page_list;
6287 +               page_list = (unsigned long) new_page;
6288 +               pages_allocated++;
6289 +       }
6290 +
6291 +       next_page = (unsigned long) page_list;
6292 +       checksum_index = 0;
6293 +
6294 +       return 0;
6295 +}
6296 +
6297 +#if 0
6298 +static void print_checksum(char *buf, int size)
6299 +{
6300 +       int index;
6301 +
6302 +       for (index = 0; index < size; index++)
6303 +               printk(KERN_INFO "%x ", buf[index]);
6304 +
6305 +       printk("\n");
6306 +}
6307 +#endif
6308 +
6309 +char *tuxonice_get_next_checksum(void)
6310 +{
6311 +       if (!toi_checksum_ops.enabled)
6312 +               return NULL;
6313 +
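+       /* Step to the next slot in the current page, or hop to the next
+        * page in the chain, skipping its leading next-page pointer. */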
6314 +       if (checksum_index % CHECKSUMS_PER_PAGE)
6315 +               this_checksum += CHECKSUM_SIZE;
6316 +       else {
6317 +               this_checksum = next_page + sizeof(void *);
6318 +               next_page = *((unsigned long *) next_page);
6319 +       }
6320 +
6321 +       checksum_index++;
6322 +       return (char *) this_checksum;
6323 +}
6324 +
6325 +int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
6326 +{
6327 +       char *pa;
6328 +       int result, cpu = smp_processor_id();
6329 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
6330 +
6331 +       if (!toi_checksum_ops.enabled)
6332 +               return 0;
6333 +
6334 +       pa = kmap(page);
6335 +       memcpy(ctx->buf, pa, PAGE_SIZE);
6336 +       kunmap(page);
6337 +       result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
6338 +                                               checksum_locn);
6339 +       return result;
6340 +}
6341 +/*
6342 + * Calculate checksums
6343 + */
6344 +
6345 +void check_checksums(void)
6346 +{
6347 +       int pfn, index = 0, cpu = smp_processor_id();
6348 +       unsigned long next_page, this_checksum = 0;
6349 +       char current_checksum[CHECKSUM_SIZE];
6350 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
6351 +
6352 +       if (!toi_checksum_ops.enabled)
6353 +               return;
6354 +
6355 +       next_page = (unsigned long) page_list;
6356 +
6357 +       toi_num_resaved = 0;
6358 +
6359 +       BITMAP_FOR_EACH_SET(&pageset2_map, pfn) {
6360 +               int ret;
6361 +               char *pa;
6362 +               struct page *page = pfn_to_page(pfn);
6363 +
6364 +               if (index % CHECKSUMS_PER_PAGE) {
6365 +                       this_checksum += CHECKSUM_SIZE;
6366 +               } else {
6367 +                       this_checksum = next_page + sizeof(void *);
6368 +                       next_page = *((unsigned long *) next_page);
6369 +               }
6370 +
6371 +               /* This runs with IRQs disabled, so atomic kmaps are required */
6372 +               pa = kmap_atomic(page, KM_USER1);
6373 +               memcpy(ctx->buf, pa, PAGE_SIZE);
6374 +               kunmap_atomic(pa, KM_USER1);
6375 +               ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
6376 +                                                       current_checksum);
6377 +
6378 +               if (ret) {
6379 +                       printk(KERN_INFO "Digest failed. Returned %d.\n", ret);
6380 +                       return;
6381 +               }
6382 +
6383 +               if (memcmp(current_checksum, (char *) this_checksum,
6384 +                                                       CHECKSUM_SIZE)) {
6385 +                       SetPageResave(pfn_to_page(pfn));
6386 +                       toi_num_resaved++;
6387 +                       if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
6388 +                               set_abort_result(TOI_RESAVE_NEEDED);
6389 +               }
6390 +
6391 +               index++;
6392 +       }
6393 +}
6394 +
6395 +static struct toi_sysfs_data sysfs_params[] = {
6396 +       { TOI_ATTR("enabled", SYSFS_RW),
6397 +         SYSFS_INT(&toi_checksum_ops.enabled, 0, 1, 0)
6398 +       },
6399 +
6400 +       { TOI_ATTR("abort_if_resave_needed", SYSFS_RW),
6401 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_ABORT_ON_RESAVE_NEEDED, 0)
6402 +       }
6403 +};
6404 +
6405 +/*
6406 + * Ops structure.
6407 + */
6408 +static struct toi_module_ops toi_checksum_ops = {
6409 +       .type                   = MISC_MODULE,
6410 +       .name                   = "checksumming",
6411 +       .directory              = "checksum",
6412 +       .module                 = THIS_MODULE,
6413 +       .initialise             = toi_checksum_initialise,
6414 +       .cleanup                = toi_checksum_cleanup,
6415 +       .print_debug_info       = toi_checksum_print_debug_stats,
6416 +       .save_config_info       = toi_checksum_save_config_info,
6417 +       .load_config_info       = toi_checksum_load_config_info,
6418 +       .memory_needed          = toi_checksum_memory_needed,
6419 +       .storage_needed         = toi_checksum_storage_needed,
6420 +
6421 +       .sysfs_data             = sysfs_params,
6422 +       .num_sysfs_entries      = sizeof(sysfs_params) /
6423 +               sizeof(struct toi_sysfs_data),
6424 +};
6425 +
6426 +/* ---- Registration ---- */
6427 +int toi_checksum_init(void)
6428 +{
6429 +       int result = toi_register_module(&toi_checksum_ops);
6430 +       return result;
6431 +}
6432 +
6433 +void toi_checksum_exit(void)
6434 +{
6435 +       toi_unregister_module(&toi_checksum_ops);
6436 +}
6437 diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
6438 new file mode 100644
6439 index 0000000..81b928d
6440 --- /dev/null
6441 +++ b/kernel/power/tuxonice_checksum.h
6442 @@ -0,0 +1,32 @@
6443 +/*
6444 + * kernel/power/tuxonice_checksum.h
6445 + *
6446 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
6447 + * Copyright (C) 2006 Red Hat, inc.
6448 + *
6449 + * This file is released under the GPLv2.
6450 + *
6451 + * This file contains data checksum routines for TuxOnIce,
6452 + * using cryptoapi. They are used to locate any modifications
6453 + * made to pageset 2 while we're saving it.
6454 + */
6455 +
6456 +#if defined(CONFIG_TOI_CHECKSUM)
6457 +extern int toi_checksum_init(void);
6458 +extern void toi_checksum_exit(void);
6459 +void check_checksums(void);
6460 +int allocate_checksum_pages(void);
6461 +void free_checksum_pages(void);
6462 +char *tuxonice_get_next_checksum(void);
6463 +int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
6464 +#else
6465 +static inline int toi_checksum_init(void) { return 0; }
6466 +static inline void toi_checksum_exit(void) { }
6467 +static inline void check_checksums(void) { }
6468 +static inline int allocate_checksum_pages(void) { return 0; }
6469 +static inline void free_checksum_pages(void) { }
6470 +static inline char *tuxonice_get_next_checksum(void) { return NULL; }
6471 +static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
6472 +       { return 0; }
6473 +#endif
6474 +
6475 diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
6476 new file mode 100644
6477 index 0000000..b5c9ea1
6478 --- /dev/null
6479 +++ b/kernel/power/tuxonice_cluster.c
6480 @@ -0,0 +1,1088 @@
6481 +/*
6482 + * kernel/power/tuxonice_cluster.c
6483 + *
6484 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
6485 + *
6486 + * This file is released under the GPLv2.
6487 + *
6488 + * This file contains routines for cluster hibernation support.
6489 + *
6490 + * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
6491 + *
6492 + * How does it work?
6493 + *
6494 + * There is no 'master' node that tells everyone else what to do. All nodes
6495 + * send messages to the broadcast address/port, maintain a list of peers
6496 + * and figure out when to progress to the next step in hibernating or resuming.
6497 + * This makes us more fault tolerant when it comes to nodes coming and going
6498 + * (which may be more of an issue if we're hibernating when power supplies
6499 + * are unreliable).
6500 + *
6501 + * At boot time, we start a ktuxonice thread that handles communication with
6502 + * other nodes. This node maintains a state machine that controls our progress
6503 + * through hibernating and resuming, keeping us in step with other nodes. Nodes
6504 + * are identified by their hw address.
6505 + *
6506 + * On startup, the node sends CLUSTER_PING on the configured interface's
6507 + * broadcast address, port $toi_cluster_port (see below) and begins to listen
6508 + * for other broadcast messages. CLUSTER_PING messages are repeated at
6509 + * intervals of 5 minutes, with a random offset to spread traffic out.
6510 + *
6511 + * A hibernation cycle is initiated from any node via
6512 + *
6513 + * echo > /sys/power/tuxonice/do_hibernate
6514 + *
6515 + * and (possibly) the hibernate script. At each step of the process, the node
6516 + * completes its work, and waits for all other nodes to signal completion of
6517 + * their work (or timeout) before progressing to the next step.
6518 + *
6519 + * Request/state  Action before reply  Possible reply  Next state
6520 + * HIBERNATE     capable, pre-script   HIBERNATE|ACK   NODE_PREP
6521 + *                                     HIBERNATE|NACK  INIT_0
6522 + *
6523 + * PREP                  prepare_image         PREP|ACK        IMAGE_WRITE
6524 + *                                     PREP|NACK       INIT_0
6525 + *                                     ABORT           RUNNING
6526 + *
6527 + * IO            write image           IO|ACK          power off
6528 + *                                     ABORT           POST_RESUME
6529 + *
6530 + * (Boot time)   check for image       IMAGE|ACK       RESUME_PREP
6531 + *                                     (Note 1)
6532 + *                                     IMAGE|NACK      (Note 2)
6533 + *
6534 + * PREP                  prepare read image    PREP|ACK        IMAGE_READ
6535 + *                                     PREP|NACK       (As NACK_IMAGE)
6536 + *
6537 + * IO            read image            IO|ACK          POST_RESUME
6538 + *
6539 + * POST_RESUME   thaw, post-script                     RUNNING
6540 + *
6541 + * INIT_0        init 0
6542 + *
6543 + * Other messages:
6544 + *
6545 + * - PING: Request for all other live nodes to send a PONG. Used at startup to
6546 + *   announce presence, when a node is suspected dead and periodically, in case
6547 + *   segments of the network are [un]plugged.
6548 + *
6549 + * - PONG: Response to a PING.
6550 + *
6551 + * - ABORT: Request to cancel writing an image.
6552 + *
6553 + * - BYE: Notification that this node is shutting down.
6554 + *
6555 + * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
6556 + * nodes which are slower to start up can get state synchronised. If a node
6557 + * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
6558 + * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
6559 + * must invalidate its image (if any) and boot normally.
6560 + *
6561 + * Note 2: May occur when one node lost power or powered off while others
6562 + * hibernated. This node waits for others to complete resuming (ACK_READ)
6563 + * before completing its boot, so that it appears as a failed node restarting.
6564 + *
6565 + * If any node has an image, then it also has a list of nodes that hibernated
6566 + * in synchronisation with it. The node will wait for other nodes to appear
6567 + * or timeout before beginning its restoration.
6568 + *
6569 + * If a node has no image, it needs to wait, in case other nodes which do have
6570 + * an image are going to resume, but are taking longer to announce their
6571 + * presence. For this reason, the user can specify a timeout value and a number
6572 + * of nodes detected before we just continue. (We might want to assume in a
6573 + * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
6574 + * the remaining nodes will too. This might help in situations where some nodes
6575 + * are much slower to boot, or more subject to hardware failures and the like).
6576 + */
6577 +
6578 +#include <linux/suspend.h>
6579 +#include <linux/module.h>
6580 +#include <linux/moduleparam.h>
6581 +#include <linux/if.h>
6582 +#include <linux/rtnetlink.h>
6583 +#include <linux/ip.h>
6584 +#include <linux/udp.h>
6585 +#include <linux/in.h>
6586 +#include <linux/if_arp.h>
6587 +#include <linux/kthread.h>
6588 +#include <linux/wait.h>
6589 +#include <linux/netdevice.h>
6590 +#include <net/ip.h>
6591 +
6592 +#include "tuxonice.h"
6593 +#include "tuxonice_modules.h"
6594 +#include "tuxonice_sysfs.h"
6595 +#include "tuxonice_alloc.h"
6596 +#include "tuxonice_io.h"
6597 +
6598 +#if 1
6599 +#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
6600 +#else
6601 +#define PRINTK(a, b...) do { } while (0)
6602 +#endif
6603 +
6604 +static int loopback_mode;
6605 +static int num_local_nodes = 1;
6606 +#define MAX_LOCAL_NODES 8
6607 +#define SADDR (loopback_mode ? b->sid : h->saddr)
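+/* NB: SADDR expands against local variables 'b' (struct toi_pkt *) and
+ * 'h' (struct iphdr *), so it is only usable where both are in scope. */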
6608 +
6609 +#define MYNAME "TuxOnIce Clustering"
6610 +
6611 +enum cluster_message {
6612 +       MSG_ACK = 1,
6613 +       MSG_NACK = 2,
6614 +       MSG_PING = 4,
6615 +       MSG_ABORT = 8,
6616 +       MSG_BYE = 16,
6617 +       MSG_HIBERNATE = 32,
6618 +       MSG_IMAGE = 64,
6619 +       MSG_IO = 128,
6620 +       MSG_RUNNING = 256
6621 +};
6622 +
6623 +static char *str_message(int message)
6624 +{
6625 +       switch (message) {
6626 +       case 4:
6627 +               return "Ping";
6628 +       case 8:
6629 +               return "Abort";
6630 +       case 9:
6631 +               return "Abort acked";
6632 +       case 10:
6633 +               return "Abort nacked";
6634 +       case 16:
6635 +               return "Bye";
6636 +       case 17:
6637 +               return "Bye acked";
6638 +       case 18:
6639 +               return "Bye nacked";
6640 +       case 32:
6641 +               return "Hibernate request";
6642 +       case 33:
6643 +               return "Hibernate ack";
6644 +       case 34:
6645 +               return "Hibernate nack";
6646 +       case 64:
6647 +               return "Image exists?";
6648 +       case 65:
6649 +               return "Image does exist";
6650 +       case 66:
6651 +               return "No image here";
6652 +       case 128:
6653 +               return "I/O";
6654 +       case 129:
6655 +               return "I/O okay";
6656 +       case 130:
6657 +               return "I/O failed";
6658 +       case 256:
6659 +               return "Running";
6660 +       default:
6661 +               printk(KERN_INFO "Unrecognised message %d.\n", message);
6662 +               return "Unrecognised message (see dmesg)";
6663 +       }
6664 +}
6665 +
6666 +#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
6667 +#define MSG_STATE_MASK (~MSG_ACK_MASK)
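+
+/*
+ * A message on the wire is a state value with an optional ACK or NACK
+ * bit OR'd in; that is where str_message()'s numeric cases come from.
+ * For example:
+ *
+ *     msg   = MSG_HIBERNATE | MSG_ACK;   -- 32 | 1 == 33, "Hibernate ack"
+ *     state = msg & MSG_STATE_MASK;      -- MSG_HIBERNATE
+ *     ack   = msg & MSG_ACK_MASK;        -- MSG_ACK
+ */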
6668 +
6669 +struct node_info {
6670 +       struct list_head member_list;
6671 +       wait_queue_head_t member_events;
6672 +       spinlock_t member_list_lock;
6673 +       spinlock_t receive_lock;
6674 +       int peer_count, ignored_peer_count;
6675 +       struct toi_sysfs_data sysfs_data;
6676 +       enum cluster_message current_message;
6677 +};
6678 +
6679 +struct node_info node_array[MAX_LOCAL_NODES];
6680 +
6681 +struct cluster_member {
6682 +       __be32 addr;
6683 +       enum cluster_message message;
6684 +       struct list_head list;
6685 +       int ignore;
6686 +};
6687 +
6688 +#define toi_cluster_port_send 3501
6689 +#define toi_cluster_port_recv 3502
6690 +
6691 +static struct net_device *net_dev;
6692 +static struct toi_module_ops toi_cluster_ops;
6693 +
6694 +static int toi_recv(struct sk_buff *skb, struct net_device *dev,
6695 +               struct packet_type *pt, struct net_device *orig_dev);
6696 +
6697 +static struct packet_type toi_cluster_packet_type = {
6698 +       .type = __constant_htons(ETH_P_IP),
6699 +       .func = toi_recv,
6700 +};
6701 +
6702 +struct toi_pkt {               /* BOOTP packet format */
6703 +       struct iphdr iph;       /* IP header */
6704 +       struct udphdr udph;     /* UDP header */
6705 +       u8 htype;               /* HW address type */
6706 +       u8 hlen;                /* HW address length */
6707 +       __be32 xid;             /* Transaction ID */
6708 +       __be16 secs;            /* Seconds since we started */
6709 +       __be16 flags;           /* Just what it says */
6710 +       u8 hw_addr[16];         /* Sender's HW address */
6711 +       u16 message;            /* Message */
6712 +       unsigned long sid;      /* Source ID for loopback testing */
6713 +};
6714 +
6715 +static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
6716 +
6717 +static int added_pack;
6718 +
6719 +static int others_have_image;
6720 +
6721 +/* Key used to allow multiple clusters on the same lan */
6722 +static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
6723 +static char pre_hibernate_script[255] =
6724 +       CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
6725 +static char post_hibernate_script[255] =
6726 +       CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
6727 +
6728 +/*                     List of cluster members                 */
6729 +static unsigned long continue_delay = 5 * HZ;
6730 +static unsigned long cluster_message_timeout = 3 * HZ;
6731 +
6732 +/*             === Membership list ===         */
6733 +
6734 +static void print_member_info(int index)
6735 +{
6736 +       struct cluster_member *this;
6737 +
6738 +       printk(KERN_INFO "==> Dumping node %d.\n", index);
6739 +
6740 +       list_for_each_entry(this, &node_array[index].member_list, list)
6741 +               printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
6742 +                               NIPQUAD(this->addr),
6743 +                               str_message(this->message),
6744 +                               this->ignore ? "(Ignored)" : "");
6745 +       printk(KERN_INFO "== Done ==\n");
6746 +}
6747 +
6748 +static struct cluster_member *__find_member(int index, __be32 addr)
6749 +{
6750 +       struct cluster_member *this;
6751 +
6752 +       list_for_each_entry(this, &node_array[index].member_list, list) {
6753 +               if (this->addr != addr)
6754 +                       continue;
6755 +
6756 +               return this;
6757 +       }
6758 +
6759 +       return NULL;
6760 +}
6761 +
6762 +static void set_ignore(int index, __be32 addr, struct cluster_member *this)
6763 +{
6764 +       if (this->ignore) {
6765 +               PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
6766 +                               index, NIPQUAD(addr));
6767 +               return;
6768 +       }
6769 +
6770 +       PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
6771 +                               index, NIPQUAD(addr));
6772 +       this->ignore = 1;
6773 +       node_array[index].ignored_peer_count++;
6774 +}
6775 +
6776 +static int __add_update_member(int index, __be32 addr, int message)
6777 +{
6778 +       struct cluster_member *this;
6779 +
6780 +       this = __find_member(index, addr);
6781 +       if (this) {
6782 +               if (this->message != message) {
6783 +                       this->message = message;
6784 +                       if ((message & MSG_NACK) &&
6785 +                           (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
6786 +                               set_ignore(index, addr, this);
6787 +                       PRINTK("Node %d sees node %d.%d.%d.%d now sending "
6788 +                                       "%s.\n", index, NIPQUAD(addr),
6789 +                                       str_message(message));
6790 +                       wake_up(&node_array[index].member_events);
6791 +               }
6792 +               return 0;
6793 +       }
6794 +
6795 +       this = (struct cluster_member *) toi_kzalloc(36,
6796 +                       sizeof(struct cluster_member), GFP_KERNEL);
6797 +
6798 +       if (!this)
6799 +               return -1;
6800 +
6801 +       this->addr = addr;
6802 +       this->message = message;
6803 +       this->ignore = 0;
6804 +       INIT_LIST_HEAD(&this->list);
6805 +
6806 +       node_array[index].peer_count++;
6807 +
6808 +       PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
6809 +                       NIPQUAD(addr), str_message(message));
6810 +
6811 +       if ((message & MSG_NACK) &&
6812 +           (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
6813 +               set_ignore(index, addr, this);
6814 +       list_add_tail(&this->list, &node_array[index].member_list);
6815 +       return 1;
6816 +}
6817 +
6818 +static int add_update_member(int index, __be32 addr, int message)
6819 +{
6820 +       int result;
6821 +       unsigned long flags;
6822 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
6823 +       result = __add_update_member(index, addr, message);
6824 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
6825 +
6826 +       print_member_info(index);
6827 +
6828 +       wake_up(&node_array[index].member_events);
6829 +
6830 +       return result;
6831 +}
6832 +
6833 +static void del_member(int index, __be32 addr)
6834 +{
6835 +       struct cluster_member *this;
6836 +       unsigned long flags;
6837 +
6838 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
6839 +       this = __find_member(index, addr);
6840 +
6841 +       if (this) {
6842 +               list_del_init(&this->list);
6843 +               toi_kfree(36, this);
6844 +               node_array[index].peer_count--;
6845 +       }
6846 +
6847 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
6848 +}
6849 +
6850 +/*             === Message transmission ===    */
6851 +
6852 +static void toi_send_if(int message, unsigned long my_id);
6853 +
6854 +/*
6855 + *  Process received TOI packet.
6856 + */
6857 +static int toi_recv(struct sk_buff *skb, struct net_device *dev,
6858 +               struct packet_type *pt, struct net_device *orig_dev)
6859 +{
6860 +       struct toi_pkt *b;
6861 +       struct iphdr *h;
6862 +       int len, result, index;
6863 +       unsigned long addr, message, ack;
6864 +
6865 +       /* Perform verifications before taking the lock.  */
6866 +       if (skb->pkt_type == PACKET_OTHERHOST)
6867 +               goto drop;
6868 +
6869 +       if (dev != net_dev)
6870 +               goto drop;
6871 +
6872 +       skb = skb_share_check(skb, GFP_ATOMIC);
6873 +       if (!skb)
6874 +               return NET_RX_DROP;
6875 +
6876 +       if (!pskb_may_pull(skb,
6877 +                          sizeof(struct iphdr) +
6878 +                          sizeof(struct udphdr)))
6879 +               goto drop;
6880 +
6881 +       b = (struct toi_pkt *)skb_network_header(skb);
6882 +       h = &b->iph;
6883 +
6884 +       if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
6885 +               goto drop;
6886 +
6887 +       /* Fragments are not supported */
6888 +       if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
6889 +               if (net_ratelimit())
6890 +                       printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
6891 +                              "cluster message.\n");
6892 +               goto drop;
6893 +       }
6894 +
6895 +       if (skb->len < ntohs(h->tot_len))
6896 +               goto drop;
6897 +
6898 +       if (ip_fast_csum((char *) h, h->ihl))
6899 +               goto drop;
6900 +
6901 +       if (b->udph.source != htons(toi_cluster_port_send) ||
6902 +           b->udph.dest != htons(toi_cluster_port_recv))
6903 +               goto drop;
6904 +
6905 +       if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
6906 +               goto drop;
6907 +
6908 +       len = ntohs(b->udph.len) - sizeof(struct udphdr);
6909 +
6910 +       /* Ok the front looks good, make sure we can get at the rest.  */
6911 +       if (!pskb_may_pull(skb, skb->len))
6912 +               goto drop;
6913 +
6914 +       b = (struct toi_pkt *)skb_network_header(skb);
6915 +       h = &b->iph;
6916 +
6917 +       addr = SADDR;
6918 +       PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
6919 +                       str_message(b->message), NIPQUAD(addr));
6920 +
6921 +       message = b->message & MSG_STATE_MASK;
6922 +       ack = b->message & MSG_ACK_MASK;
6923 +
6924 +       for (index = 0; index < num_local_nodes; index++) {
6925 +               int new_message = node_array[index].current_message,
6926 +                   old_message = new_message;
6927 +
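+               /* Skip our own node (in loopback mode SADDR is the sending
+                * node's id) and nodes whose thread has been told to exit
+                * (current_message cleared by kill_clusterd()). */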
6928 +               if (index == SADDR || !old_message) {
6929 +                       PRINTK("Ignoring node %d (offline or self).\n", index);
6930 +                       continue;
6931 +               }
6932 +
6933 +               /* One message at a time, please. */
6934 +               spin_lock(&node_array[index].receive_lock);
6935 +
6936 +               result = add_update_member(index, SADDR, b->message);
6937 +               if (result == -1) {
6938 +                       printk(KERN_INFO "Failed to add new cluster member "
6939 +                                       NIPQUAD_FMT ".\n",
6940 +                                       NIPQUAD(addr));
6941 +                       goto drop_unlock;
6942 +               }
6943 +
6944 +               switch (b->message & MSG_STATE_MASK) {
6945 +               case MSG_PING:
6946 +                       break;
6947 +               case MSG_ABORT:
6948 +                       break;
6949 +               case MSG_BYE:
6950 +                       break;
6951 +               case MSG_HIBERNATE:
6952 +                       /* Can I hibernate? */
6953 +                       new_message = MSG_HIBERNATE |
6954 +                               ((index & 1) ? MSG_NACK : MSG_ACK);
6955 +                       break;
6956 +               case MSG_IMAGE:
6957 +                       /* Can I resume? */
6958 +                       new_message = MSG_IMAGE |
6959 +                               ((index & 1) ? MSG_NACK : MSG_ACK);
6960 +                       if (new_message != old_message)
6961 +                               printk(KERN_INFO "Setting whether I can resume to %d.\n",
6962 +                                               new_message);
6963 +                       break;
6964 +               case MSG_IO:
6965 +                       new_message = MSG_IO | MSG_ACK;
6966 +                       break;
6967 +               case MSG_RUNNING:
6968 +                       break;
6969 +               default:
6970 +                       if (net_ratelimit())
6971 +                               printk(KERN_ERR "Unrecognised TuxOnIce cluster"
6972 +                                       " message %d from " NIPQUAD_FMT ".\n",
6973 +                                       b->message, NIPQUAD(addr));
6974 +               }
6975 +
6976 +               if (old_message != new_message) {
6977 +                       node_array[index].current_message = new_message;
6978 +                       printk(KERN_INFO ">>> Sending new message for node "
6979 +                                       "%d.\n", index);
6980 +                       toi_send_if(new_message, index);
6981 +               } else if (!ack) {
6982 +                       printk(KERN_INFO ">>> Resending message for node %d.\n",
6983 +                                       index);
6984 +                       toi_send_if(new_message, index);
6985 +               }
6986 +drop_unlock:
6987 +               spin_unlock(&node_array[index].receive_lock);
6988 +       }
6989 +
6990 +drop:
6991 +       /* Throw the packet out. */
6992 +       kfree_skb(skb);
6993 +
6994 +       return 0;
6995 +}
6996 +
6997 +/*
6998 + *  Send cluster message to single interface.
6999 + */
7000 +static void toi_send_if(int message, unsigned long my_id)
7001 +{
7002 +       struct sk_buff *skb;
7003 +       struct toi_pkt *b;
7004 +       int hh_len = LL_RESERVED_SPACE(net_dev);
7005 +       struct iphdr *h;
7006 +
7007 +       /* Allocate packet */
7008 +       skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
7009 +       if (!skb)
7010 +               return;
7011 +       skb_reserve(skb, hh_len);
7012 +       b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
7013 +       memset(b, 0, sizeof(struct toi_pkt));
7014 +
7015 +       /* Construct IP header */
7016 +       skb_reset_network_header(skb);
7017 +       h = ip_hdr(skb);
7018 +       h->version = 4;
7019 +       h->ihl = 5;
7020 +       h->tot_len = htons(sizeof(struct toi_pkt));
7021 +       h->frag_off = htons(IP_DF);
7022 +       h->ttl = 64;
7023 +       h->protocol = IPPROTO_UDP;
7024 +       h->daddr = htonl(INADDR_BROADCAST);
7025 +       h->check = ip_fast_csum((unsigned char *) h, h->ihl);
7026 +
7027 +       /* Construct UDP header */
7028 +       b->udph.source = htons(toi_cluster_port_send);
7029 +       b->udph.dest = htons(toi_cluster_port_recv);
7030 +       b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
7031 +       /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
7032 +
7033 +       /* Construct message */
7034 +       b->message = message;
7035 +       b->sid = my_id;
7036 +       b->htype = net_dev->type; /* can cause undefined behavior */
7037 +       b->hlen = net_dev->addr_len;
7038 +       memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
7039 +       b->secs = htons(3); /* 3 seconds */
7040 +
7041 +       /* Chain packet down the line... */
7042 +       skb->dev = net_dev;
7043 +       skb->protocol = htons(ETH_P_IP);
7044 +       if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
7045 +                    net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
7046 +                       dev_queue_xmit(skb) < 0)
7047 +               printk(KERN_INFO "E");
7048 +}
7049 +
7050 +/*     =========================================               */
7051 +
7052 +/*                     kTOICluster                     */
7053 +
7054 +static atomic_t num_cluster_threads;
7055 +static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
7056 +
7057 +static int kTOICluster(void *data)
7058 +{
7059 +       unsigned long my_id;
7060 +
7061 +       my_id = atomic_add_return(1, &num_cluster_threads) - 1;
7062 +       node_array[my_id].current_message = (unsigned long) data;
7063 +
7064 +       PRINTK("kTOICluster daemon %lu starting.\n", my_id);
7065 +
7066 +       current->flags |= PF_NOFREEZE;
7067 +
7068 +       while (node_array[my_id].current_message) {
7069 +               toi_send_if(node_array[my_id].current_message, my_id);
7070 +               sleep_on_timeout(&clusterd_events,
7071 +                               cluster_message_timeout);
7072 +               PRINTK("Link state %lu is %d.\n", my_id,
7073 +                               node_array[my_id].current_message);
7074 +       }
7075 +
7076 +       toi_send_if(MSG_BYE, my_id);
7077 +       atomic_dec(&num_cluster_threads);
7078 +       wake_up(&clusterd_events);
7079 +
7080 +       PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
7081 +       __set_current_state(TASK_RUNNING);
7082 +       return 0;
7083 +}
7084 +
7085 +static void kill_clusterd(void)
7086 +{
7087 +       int i;
7088 +
7089 +       for (i = 0; i < num_local_nodes; i++) {
7090 +               if (node_array[i].current_message) {
7091 +                       PRINTK("Seeking to kill clusterd %d.\n", i);
7092 +                       node_array[i].current_message = 0;
7093 +               }
7094 +       }
7095 +       wait_event(clusterd_events,
7096 +                       !atomic_read(&num_cluster_threads));
7097 +       PRINTK("All cluster daemons have exited.\n");
7098 +}
7099 +
7100 +static int peers_not_in_message(int index, int message, int precise)
7101 +{
7102 +       struct cluster_member *this;
7103 +       unsigned long flags;
7104 +       int result = 0;
7105 +
7106 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
7107 +       list_for_each_entry(this, &node_array[index].member_list, list) {
7108 +               if (this->ignore)
7109 +                       continue;
7110 +
7111 +               PRINTK("Peer %d.%d.%d.%d sending %s. "
7112 +                       "Seeking %s.\n",
7113 +                       NIPQUAD(this->addr),
7114 +                       str_message(this->message), str_message(message));
7115 +               if ((precise ? this->message :
7116 +                                       this->message & MSG_STATE_MASK) !=
7117 +                                       message)
7118 +                       result++;
7119 +       }
7120 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
7121 +       PRINTK("%d peers in sought message.\n", result);
7122 +       return result;
7123 +}
7124 +
7125 +static void reset_ignored(int index)
7126 +{
7127 +       struct cluster_member *this;
7128 +       unsigned long flags;
7129 +
7130 +       spin_lock_irqsave(&node_array[index].member_list_lock, flags);
7131 +       list_for_each_entry(this, &node_array[index].member_list, list)
7132 +               this->ignore = 0;
7133 +       node_array[index].ignored_peer_count = 0;
7134 +       spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
7135 +}
7136 +
7137 +static int peers_in_message(int index, int message, int precise)
7138 +{
7139 +       return node_array[index].peer_count -
7140 +               node_array[index].ignored_peer_count -
7141 +               peers_not_in_message(index, message, precise);
7142 +}
7143 +
7144 +static int time_to_continue(int index, unsigned long start, int message)
7145 +{
7146 +       int first = peers_not_in_message(index, message, 0);
7147 +       int second = peers_in_message(index, message, 1);
7148 +
7149 +       PRINTK("First part returns %d, second returns %d.\n", first, second);
7150 +
7151 +       if (!first && !second) {
7152 +               PRINTK("All peers answered message %d.\n",
7153 +                       message);
7154 +               return 1;
7155 +       }
7156 +
7157 +       if (time_after(jiffies, start + continue_delay)) {
7158 +               PRINTK("Timeout reached.\n");
7159 +               return 1;
7160 +       }
7161 +
7162 +       PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
7163 +                       start + continue_delay);
7164 +       return 0;
7165 +}
7166 +
7167 +void toi_initiate_cluster_hibernate(void)
7168 +{
7169 +       int result;
7170 +       unsigned long start;
7171 +
7172 +       result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
7173 +       if (result)
7174 +               return;
7175 +
7176 +       toi_send_if(MSG_HIBERNATE, 0);
7177 +
7178 +       start = jiffies;
7179 +       wait_event(node_array[0].member_events,
7180 +                       time_to_continue(0, start, MSG_HIBERNATE));
7181 +
7182 +       if (test_action_state(TOI_FREEZER_TEST)) {
7183 +               toi_send_if(MSG_ABORT, 0);
7184 +
7185 +               start = jiffies;
7186 +               wait_event(node_array[0].member_events,
7187 +                       time_to_continue(0, start, MSG_RUNNING));
7188 +
7189 +               do_toi_step(STEP_QUIET_CLEANUP);
7190 +               return;
7191 +       }
7192 +
7193 +       toi_send_if(MSG_IO, 0);
7194 +
7195 +       result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
7196 +       if (result)
7197 +               return;
7198 +
7199 +       /* This code runs at resume time too! */
7200 +       if (toi_in_hibernate)
7201 +               result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
7202 +}
7203 +EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate);
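A note on the tail of this function: do_toi_step(STEP_HIBERNATE_SAVE_IMAGE) is also where a restored kernel resumes execution, hence the comment above the final test. Only when toi_in_hibernate is still set - that is, we are the kernel that has just written the image, not one that has just been restored - is the powerdown step taken; a resumed kernel falls through and keeps running.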
7204 +
7205 +/* toi_cluster_print_debug_stats
7206 + *
7207 + * Description:        Print information to be recorded for debugging purposes into a
7208 + *             buffer.
7209 + * Arguments:  buffer: Pointer to a buffer into which the debug info will be
7210 + *                     printed.
7211 + *             size:   Size of the buffer.
7212 + * Returns:    Number of characters written to the buffer.
7213 + */
7214 +static int toi_cluster_print_debug_stats(char *buffer, int size)
7215 +{
7216 +       int len;
7217 +
7218 +       if (strlen(toi_cluster_iface))
7219 +               len = snprintf_used(buffer, size,
7220 +                               "- Cluster interface is '%s'.\n",
7221 +                               toi_cluster_iface);
7222 +       else
7223 +               len = snprintf_used(buffer, size,
7224 +                               "- Cluster support is disabled.\n");
7225 +       return len;
7226 +}
7227 +
7228 +/* cluster_memory_needed
7229 + *
7230 + * Description:        Tell the caller how much memory we need to operate during
7231 + *             hibernate/resume.
7232 + * Returns:    Maximum number of bytes of memory required for
7233 + *             operation.
7234 + */
7235 +static int toi_cluster_memory_needed(void)
7236 +{
7237 +       return 0;
7238 +}
7239 +
7240 +static int toi_cluster_storage_needed(void)
7241 +{
7242 +       return 1 + strlen(toi_cluster_iface);
7243 +}
7244 +
7245 +/* toi_cluster_save_config_info
7246 + *
7247 + * Description:        Save information needed when reloading the image at resume time.
7248 + * Arguments:  Buffer:         Pointer to a buffer of size PAGE_SIZE.
7249 + * Returns:    Number of bytes used for saving our data.
7250 + */
7251 +static int toi_cluster_save_config_info(char *buffer)
7252 +{
7253 +       strcpy(buffer, toi_cluster_iface);
7254 +       return strlen(toi_cluster_iface) + 1;
7255 +}
7256 +
7257 +/* toi_cluster_load_config_info
7258 + *
7259 + * Description:        Reload information needed for declustering the image at
7260 + *             resume time.
7261 + * Arguments:  Buffer:         Pointer to the start of the data.
7262 + *             Size:           Number of bytes that were saved.
7263 + */
7264 +static void toi_cluster_load_config_info(char *buffer, int size)
7265 +{
7266 +       strncpy(toi_cluster_iface, buffer, size);
7267 +       return;
7268 +}
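Taken with toi_cluster_storage_needed() above, the config blob is just the interface name plus its terminating NUL. A minimal sketch of the round trip (the wrapper function is hypothetical; the two calls are the ones defined above):

    static void cluster_config_roundtrip_sketch(void)
    {
            char blob[IFNAMSIZ];                            /* room for an iface name */
            int used = toi_cluster_save_config_info(blob);  /* strlen(iface) + 1      */

            toi_cluster_load_config_info(blob, used);       /* copies the name back   */
    }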
7269 +
7270 +static void cluster_startup(void)
7271 +{
7272 +       int have_image = do_check_can_resume(), i;
7273 +       unsigned long start = jiffies, initial_message;
7274 +       struct task_struct *p;
7275 +
7276 +       initial_message = MSG_IMAGE;
7277 +
7278 +       have_image = 1; /* XXX: debug override - ignores do_check_can_resume() */
7279 +
7280 +       for (i = 0; i < num_local_nodes; i++) {
7281 +               PRINTK("Starting ktoiclusterd %d.\n", i);
7282 +               p = kthread_create(kTOICluster, (void *) initial_message,
7283 +                               "ktoiclusterd/%d", i);
7284 +               if (IS_ERR(p)) {
7285 +                       printk(KERN_ERR "Failed to start ktoiclusterd.\n");
7286 +                       return;
7287 +               }
7288 +
7289 +               wake_up_process(p);
7290 +       }
7291 +
7292 +       /* Wait for delay or someone else sending first message */
7293 +       wait_event(node_array[0].member_events, time_to_continue(0, start,
7294 +                               MSG_IMAGE));
7295 +
7296 +       others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
7297 +
7298 +       printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
7299 +               " %d.\n", have_image ? "" : "don't ", others_have_image);
7300 +
7301 +       if (have_image) {
7302 +               int result;
7303 +
7304 +               /* Start to resume */
7305 +               printk(KERN_INFO "  === Starting to resume ===  \n");
7306 +               node_array[0].current_message = MSG_IO;
7307 +               toi_send_if(MSG_IO, 0);
7308 +
7309 +               /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
7310 +               result = 0;
7311 +
7312 +               if (!result) {
7313 +                       /*
7314 +                        * Atomic restore - we'll come back in the hibernation
7315 +                        * path.
7316 +                        */
7317 +
7318 +                       /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
7319 +                       result = 0;
7320 +
7321 +                       /* do_toi_step(STEP_QUIET_CLEANUP); */
7322 +               }
7323 +
7324 +               node_array[0].current_message |= MSG_NACK;
7325 +
7326 +               /* For debugging - disable for real life? */
7327 +               wait_event(node_array[0].member_events,
7328 +                               time_to_continue(0, start, MSG_IO));
7329 +       }
7330 +
7331 +       if (others_have_image) {
7332 +               /* Wait for them to resume */
7333 +               printk(KERN_INFO "Waiting for other nodes to resume.\n");
7334 +               start = jiffies;
7335 +               wait_event(node_array[0].member_events,
7336 +                               time_to_continue(0, start, MSG_RUNNING));
7337 +               if (peers_not_in_message(0, MSG_RUNNING, 0))
7338 +                       printk(KERN_INFO "Timed out while waiting for other "
7339 +                                       "nodes to resume.\n");
7340 +       }
7341 +
7342 +       /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
7343 +        * as appropriate.
7344 +        *
7345 +        * If we don't have an image:
7346 +        * - Wait until someone else says they have one, or conditions are met
7347 +        *   for continuing to boot (n machines or t seconds).
7348 +        * - If anyone has an image, wait for them to resume before continuing
7349 +        *   to boot.
7350 +        *
7351 +        * If we have an image:
7352 +        * - Wait until conditions are met before continuing to resume (n
7353 +        *   machines or t seconds). Send RESUME_PREP and freeze processes.
7354 +        *   NACK_PREP if freezing fails (shouldn't) and follow logic for
7355 +        *   us having no image above. On success, wait for [N]ACK_PREP from
7356 +        *   other machines. Read image (including atomic restore) until done.
7357 +        *   Wait for ACK_READ from others (should never fail). Thaw processes
7358 +        *   and do post-resume. (The section after the atomic restore is done
7359 +        *   via the code for hibernating).
7360 +        */
7361 +
7362 +       node_array[0].current_message = MSG_RUNNING;
7363 +}
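In short, the decision tree implemented (and partly stubbed out) above is: if this node has an image, broadcast MSG_IO and resume it; otherwise, if any peer reported an image, wait for their MSG_RUNNING before continuing to boot; either way the node finally settles into MSG_RUNNING.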
7364 +
7365 +/* toi_cluster_open_iface
7366 + *
7367 + * Description:        Prepare to use an interface.
7368 + */
7369 +
7370 +static int toi_cluster_open_iface(void)
7371 +{
7372 +       struct net_device *dev;
7373 +
7374 +       rtnl_lock();
7375 +
7376 +       for_each_netdev(&init_net, dev) {
7377 +               if (/* dev == &init_net.loopback_dev || */
7378 +                   strcmp(dev->name, toi_cluster_iface))
7379 +                       continue;
7380 +
7381 +               net_dev = dev;
7382 +               break;
7383 +       }
7384 +
7385 +       rtnl_unlock();
7386 +
7387 +       if (!net_dev) {
7388 +               printk(KERN_ERR MYNAME ": Device %s not found.\n",
7389 +                               toi_cluster_iface);
7390 +               return -ENODEV;
7391 +       }
7392 +
7393 +       dev_add_pack(&toi_cluster_packet_type);
7394 +       added_pack = 1;
7395 +
7396 +       loopback_mode = (net_dev == init_net.loopback_dev);
7397 +       num_local_nodes = loopback_mode ? 8 : 1;
7398 +
7399 +       PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
7400 +                       loopback_mode ? "on" : "off", num_local_nodes);
7401 +
7402 +       cluster_startup();
7403 +       return 0;
7404 +}
7405 +
7406 +/* toi_cluster_close_iface
7407 + *
7408 + * Description: Stop using an interface.
7409 + */
7410 +
7411 +static int toi_cluster_close_iface(void)
7412 +{
7413 +       kill_clusterd();
7414 +       if (added_pack) {
7415 +               dev_remove_pack(&toi_cluster_packet_type);
7416 +               added_pack = 0;
7417 +       }
7418 +       return 0;
7419 +}
7420 +
7421 +static void write_side_effect(void)
7422 +{
7423 +       if (toi_cluster_ops.enabled) {
7424 +               toi_cluster_open_iface();
7425 +               set_toi_state(TOI_CLUSTER_MODE);
7426 +       } else {
7427 +               toi_cluster_close_iface();
7428 +               clear_toi_state(TOI_CLUSTER_MODE);
7429 +       }
7430 +}
7431 +
7432 +static void node_write_side_effect(void)
7433 +{
7434 +}
7435 +
7436 +/*
7437 + * data for our sysfs entries.
7438 + */
7439 +static struct toi_sysfs_data sysfs_params[] = {
7440 +       {
7441 +               TOI_ATTR("interface", SYSFS_RW),
7442 +               SYSFS_STRING(toi_cluster_iface, IFNAMSIZ, 0)
7443 +       },
7444 +
7445 +       {
7446 +               TOI_ATTR("enabled", SYSFS_RW),
7447 +               SYSFS_INT(&toi_cluster_ops.enabled, 0, 1, 0),
7448 +               .write_side_effect = write_side_effect,
7449 +       },
7450 +
7451 +       {
7452 +               TOI_ATTR("cluster_name", SYSFS_RW),
7453 +               SYSFS_STRING(toi_cluster_key, 32, 0)
7454 +       },
7455 +
7456 +       {
7457 +               TOI_ATTR("pre-hibernate-script", SYSFS_RW),
7458 +               SYSFS_STRING(pre_hibernate_script, 256, 0)
7459 +       },
7460 +
7461 +       {
7462 +               TOI_ATTR("post-hibernate-script", SYSFS_RW),
7463 +               SYSFS_STRING(post_hibernate_script, 256, 0)
7464 +       },
7465 +
7466 +       {
7467 +               TOI_ATTR("continue_delay", SYSFS_RW),
7468 +               SYSFS_UL(&continue_delay, HZ / 2, 60 * HZ, 0)
7469 +       }
7470 +};
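With the module directory set to "cluster" in the ops structure below, these entries surface under the usual TuxOnIce sysfs root (assuming the standard /sys/power/tuxonice path seen elsewhere in this patch):

    /sys/power/tuxonice/cluster/interface
    /sys/power/tuxonice/cluster/enabled
    /sys/power/tuxonice/cluster/cluster_name
    /sys/power/tuxonice/cluster/pre-hibernate-script
    /sys/power/tuxonice/cluster/post-hibernate-script
    /sys/power/tuxonice/cluster/continue_delay

Writing "enabled" fires write_side_effect() above, which opens or closes the configured interface and toggles TOI_CLUSTER_MODE.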
7471 +
7472 +/*
7473 + * Ops structure.
7474 + */
7475 +
7476 +static struct toi_module_ops toi_cluster_ops = {
7477 +       .type                   = FILTER_MODULE,
7478 +       .name                   = "Cluster",
7479 +       .directory              = "cluster",
7480 +       .module                 = THIS_MODULE,
7481 +       .memory_needed          = toi_cluster_memory_needed,
7482 +       .print_debug_info       = toi_cluster_print_debug_stats,
7483 +       .save_config_info       = toi_cluster_save_config_info,
7484 +       .load_config_info       = toi_cluster_load_config_info,
7485 +       .storage_needed         = toi_cluster_storage_needed,
7486 +
7487 +       .sysfs_data             = sysfs_params,
7488 +       .num_sysfs_entries      = sizeof(sysfs_params) /
7489 +               sizeof(struct toi_sysfs_data),
7490 +};
7491 +
7492 +/* ---- Registration ---- */
7493 +
7494 +#ifdef MODULE
7495 +#define INIT static __init
7496 +#define EXIT static __exit
7497 +#else
7498 +#define INIT
7499 +#define EXIT
7500 +#endif
7501 +
7502 +INIT int toi_cluster_init(void)
7503 +{
7504 +       int temp = toi_register_module(&toi_cluster_ops), i;
7505 +       struct kobject *kobj = toi_cluster_ops.dir_kobj;
7506 +
7507 +       for (i = 0; i < MAX_LOCAL_NODES; i++) {
7508 +               node_array[i].current_message = 0;
7509 +               INIT_LIST_HEAD(&node_array[i].member_list);
7510 +               init_waitqueue_head(&node_array[i].member_events);
7511 +               spin_lock_init(&node_array[i].member_list_lock);
7512 +               spin_lock_init(&node_array[i].receive_lock);
7513 +
7514 +               /* Set up sysfs entry */
7515 +               node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
7516 +                               16, /* room for "node_NNN"; was sizeof(a pointer) */
7517 +                               GFP_KERNEL);
7518 +               sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
7519 +                               i);
7520 +               node_array[i].sysfs_data.attr.mode = SYSFS_RW;
7521 +               node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
7522 +               node_array[i].sysfs_data.flags = 0;
7523 +               node_array[i].sysfs_data.data.integer.variable =
7524 +                       (int *) &node_array[i].current_message;
7525 +               node_array[i].sysfs_data.data.integer.minimum = 0;
7526 +               node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
7527 +               node_array[i].sysfs_data.write_side_effect =
7528 +                       node_write_side_effect;
7529 +               toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
7530 +       }
7531 +
7532 +       toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
7533 +
7534 +       if (toi_cluster_ops.enabled)
7535 +               toi_cluster_open_iface();
7536 +
7537 +       return temp;
7538 +}
7539 +
7540 +EXIT void toi_cluster_exit(void)
7541 +{
7542 +       int i;
7543 +       toi_cluster_close_iface();
7544 +
7545 +       for (i = 0; i < MAX_LOCAL_NODES; i++)
7546 +               toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
7547 +                               &node_array[i].sysfs_data);
7548 +       toi_unregister_module(&toi_cluster_ops);
7549 +}
7550 +
7551 +static int __init toi_cluster_iface_setup(char *iface)
7552 +{
7553 +       toi_cluster_ops.enabled = (*iface &&
7554 +                       strcmp(iface, "off"));
7555 +
7556 +       if (toi_cluster_ops.enabled)
7557 +               strlcpy(toi_cluster_iface, iface, IFNAMSIZ);
     +
     +       return 1;
7558 +}
7559 +
7560 +__setup("toi_cluster=", toi_cluster_iface_setup);
7561 +
7562 +#ifdef MODULE
7563 +MODULE_LICENSE("GPL");
7564 +module_init(toi_cluster_init);
7565 +module_exit(toi_cluster_exit);
7566 +MODULE_AUTHOR("Nigel Cunningham");
7567 +MODULE_DESCRIPTION("Cluster Support for TuxOnIce");
7568 +#endif
7569 diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
7570 new file mode 100644
7571 index 0000000..cd9ee3a
7572 --- /dev/null
7573 +++ b/kernel/power/tuxonice_cluster.h
7574 @@ -0,0 +1,19 @@
7575 +/*
7576 + * kernel/power/tuxonice_cluster.h
7577 + *
7578 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
7579 + * Copyright (C) 2006 Red Hat, inc.
7580 + *
7581 + * This file is released under the GPLv2.
7582 + */
7583 +
7584 +#ifdef CONFIG_TOI_CLUSTER
7585 +extern int toi_cluster_init(void);
7586 +extern void toi_cluster_exit(void);
7587 +extern void toi_initiate_cluster_hibernate(void);
7588 +#else
7589 +static inline int toi_cluster_init(void) { return 0; }
7590 +static inline void toi_cluster_exit(void) { }
7591 +static inline void toi_initiate_cluster_hibernate(void) { }
7592 +#endif
7593 +
7594 diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
7595 new file mode 100644
7596 index 0000000..d2bbae9
7597 --- /dev/null
7598 +++ b/kernel/power/tuxonice_compress.c
7599 @@ -0,0 +1,441 @@
7600 +/*
7601 + * kernel/power/tuxonice_compress.c
7602 + *
7603 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
7604 + *
7605 + * This file is released under the GPLv2.
7606 + *
7607 + * This file contains data compression routines for TuxOnIce,
7608 + * using cryptoapi.
7609 + */
7610 +
7611 +#include <linux/module.h>
7612 +#include <linux/suspend.h>
7613 +#include <linux/highmem.h>
7614 +#include <linux/vmalloc.h>
7615 +#include <linux/crypto.h>
7616 +
7617 +#include "tuxonice_builtin.h"
7618 +#include "tuxonice.h"
7619 +#include "tuxonice_modules.h"
7620 +#include "tuxonice_sysfs.h"
7621 +#include "tuxonice_io.h"
7622 +#include "tuxonice_ui.h"
7623 +#include "tuxonice_alloc.h"
7624 +
7625 +static int toi_expected_compression;
7626 +
7627 +static struct toi_module_ops toi_compression_ops;
7628 +static struct toi_module_ops *next_driver;
7629 +
7630 +static char toi_compressor_name[32] = "lzf";
7631 +
7632 +static DEFINE_MUTEX(stats_lock);
7633 +
7634 +struct cpu_context {
7635 +       u8 *page_buffer;
7636 +       struct crypto_comp *transform;
7637 +       unsigned int len;
7638 +       char *buffer_start;
7639 +};
7640 +
7641 +static DEFINE_PER_CPU(struct cpu_context, contexts);
7642 +
7643 +static int toi_compress_prepare_result;
7644 +
7645 +/*
7646 + * toi_compress_cleanup
7647 + *
7648 + * Frees memory allocated for our labours.
7649 + */
7650 +static void toi_compress_cleanup(int toi_or_resume)
7651 +{
7652 +       int cpu;
7653 +
7654 +       if (!toi_or_resume)
7655 +               return;
7656 +
7657 +       for_each_online_cpu(cpu) {
7658 +               struct cpu_context *this = &per_cpu(contexts, cpu);
7659 +               if (this->transform) {
7660 +                       crypto_free_comp(this->transform);
7661 +                       this->transform = NULL;
7662 +               }
7663 +
7664 +               if (this->page_buffer)
7665 +                       toi_free_page(16, (unsigned long) this->page_buffer);
7666 +
7667 +               this->page_buffer = NULL;
7668 +       }
7669 +}
7670 +
7671 +/*
7672 + * toi_crypto_prepare
7673 + *
7674 + * Prepare to do some work by allocating buffers and transforms.
7675 + */
7676 +static int toi_compress_crypto_prepare(void)
7677 +{
7678 +       int cpu;
7679 +
7680 +       if (!*toi_compressor_name) {
7681 +               printk(KERN_INFO "TuxOnIce: Compression enabled but no "
7682 +                               "compressor name set.\n");
7683 +               return 1;
7684 +       }
7685 +
7686 +       for_each_online_cpu(cpu) {
7687 +               struct cpu_context *this = &per_cpu(contexts, cpu);
7688 +               this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
7689 +               if (IS_ERR(this->transform)) {
7690 +                       printk(KERN_INFO "TuxOnIce: Failed to initialise the "
7691 +                                       "%s compression transform.\n",
7692 +                                       toi_compressor_name);
7693 +                       this->transform = NULL;
7694 +                       return 1;
7695 +               }
7696 +
7697 +               this->page_buffer =
7698 +                       (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
7699 +
7700 +               if (!this->page_buffer) {
7701 +                       printk(KERN_ERR
7702 +                         "Failed to allocate a page buffer for TuxOnIce "
7703 +                         "encryption driver.\n");
7704 +                       return -ENOMEM;
7705 +               }
7706 +       }
7707 +
7708 +       return 0;
7709 +}
7710 +
7711 +/*
7712 + * toi_compress_init
7713 + */
7714 +
7715 +static int toi_compress_init(int toi_or_resume)
7716 +{
7717 +       if (!toi_or_resume)
7718 +               return 0;
7719 +
7720 +       toi_compress_bytes_in = toi_compress_bytes_out = 0;
7721 +
7722 +       next_driver = toi_get_next_filter(&toi_compression_ops);
7723 +
7724 +       if (!next_driver)
7725 +               return -ECHILD;
7726 +
7727 +       toi_compress_prepare_result = toi_compress_crypto_prepare();
7728 +
7729 +       return 0;
7730 +}
7731 +
7732 +/*
7733 + * toi_compress_rw_init()
7734 + */
7735 +
7736 +int toi_compress_rw_init(int rw, int stream_number)
7737 +{
7738 +       if (toi_compress_prepare_result) {
7739 +               printk(KERN_ERR "Failed to initialise compression algorithm.\n");
7740 +               if (rw == READ)
7741 +                       return -ENODEV;
7742 +               else
7743 +                       toi_compression_ops.enabled = 0;
7744 +       }
7745 +
7746 +       return 0;
7747 +}
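The asymmetry above is deliberate: on a write, a failed compressor setup just disables this module and the image is written uncompressed; on a read, the image on disk may already contain compressed data, so the only safe response is -ENODEV.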
7748 +
7749 +/*
7750 + * toi_compress_write_page()
7751 + *
7752 + * Compress a page of data, buffering output and passing on filled
7753 + * pages to the next module in the pipeline.
7754 + *
7755 + * Buffer_page:        Pointer to a buffer of size PAGE_SIZE, containing
7756 + * data to be compressed.
7757 + *
7758 + * Returns:    0 on success. Otherwise the error is that returned by later
7759 + *             modules, -ECHILD if we have a broken pipeline or -EIO if
7760 + *             zlib errs.
7761 + */
7762 +static int toi_compress_write_page(unsigned long index,
7763 +               struct page *buffer_page, unsigned int buf_size)
7764 +{
7765 +       int ret, cpu = smp_processor_id();
7766 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
7767 +
7768 +       if (!ctx->transform)
7769 +               return next_driver->write_page(index, buffer_page, buf_size);
7770 +
7771 +       ctx->buffer_start = kmap(buffer_page);
7772 +
7773 +       ctx->len = buf_size;
7774 +
7775 +       ret = crypto_comp_compress(ctx->transform,
7776 +                       ctx->buffer_start, buf_size,
7777 +                       ctx->page_buffer, &ctx->len);
7778 +
7779 +       kunmap(buffer_page);
7780 +
7781 +       if (ret) {
7782 +               printk(KERN_INFO "Compression failed.\n");
7783 +               goto failure;
7784 +       }
7785 +
7786 +       mutex_lock(&stats_lock);
7787 +       toi_compress_bytes_in += buf_size;
7788 +       toi_compress_bytes_out += ctx->len;
7789 +       mutex_unlock(&stats_lock);
7790 +
7791 +       if (ctx->len < buf_size) /* some compression */
7792 +               ret = next_driver->write_page(index,
7793 +                               virt_to_page(ctx->page_buffer),
7794 +                               ctx->len);
7795 +       else
7796 +               ret = next_driver->write_page(index, buffer_page, buf_size);
7797 +
7798 +failure:
7799 +       return ret;
7800 +}
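Note the fallback at the end: a page that did not shrink is passed downstream uncompressed, so the stored length of each page doubles as the "was it compressed?" flag that toi_compress_read_page tests against PAGE_SIZE below.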
7801 +
7802 +/*
7803 + * toi_compress_read_page()
7804 + * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
7805 + *
7806 + * Retrieve data from later modules and decompress it until the input buffer
7807 + * is filled.
7808 + * Zero if successful. Error condition from me or from downstream on failure.
7809 + */
7810 +static int toi_compress_read_page(unsigned long *index,
7811 +               struct page *buffer_page, unsigned int *buf_size)
7812 +{
7813 +       int ret, cpu = smp_processor_id();
7814 +       unsigned int len;
7815 +       unsigned int outlen = PAGE_SIZE;
7816 +       char *buffer_start;
7817 +       struct cpu_context *ctx = &per_cpu(contexts, cpu);
7818 +
7819 +       if (!ctx->transform)
7820 +               return next_driver->read_page(index, buffer_page, buf_size);
7821 +
7822 +       /*
7823 +        * All our reads must be synchronous - we can't decompress
7824 +        * data that hasn't been read yet.
7825 +        */
7826 +
7827 +       *buf_size = PAGE_SIZE;
7828 +
7829 +       ret = next_driver->read_page(index, buffer_page, &len);
7830 +
7831 +       /* Error or uncompressed data */
7832 +       if (ret || len == PAGE_SIZE)
7833 +               return ret;
7834 +
7835 +       buffer_start = kmap(buffer_page);
7836 +       memcpy(ctx->page_buffer, buffer_start, len);
7837 +       ret = crypto_comp_decompress(
7838 +                       ctx->transform,
7839 +                       ctx->page_buffer,
7840 +                       len, buffer_start, &outlen);
7841 +       if (ret)
7842 +               abort_hibernate(TOI_FAILED_IO,
7843 +                       "Compress_read returned %d.\n", ret);
7844 +       else if (outlen != PAGE_SIZE) {
7845 +               abort_hibernate(TOI_FAILED_IO,
7846 +                       "Decompression yielded %u bytes instead of %lu.\n",
7847 +                       outlen, PAGE_SIZE);
7848 +               printk(KERN_ERR "Decompression yielded %u bytes instead "
7849 +                       "of %lu.\n", outlen, PAGE_SIZE);
7850 +               ret = -EIO;
7851 +               *buf_size = outlen;
7852 +       }
7853 +       kunmap(buffer_page);
7854 +       return ret;
7855 +}
7856 +
7857 +/*
7858 + * toi_compress_print_debug_stats
7859 + * @buffer: Pointer to a buffer into which the debug info will be printed.
7860 + * @size: Size of the buffer.
7861 + *
7862 + * Print information to be recorded for debugging purposes into a buffer.
7863 + * Returns: Number of characters written to the buffer.
7864 + */
7865 +
7866 +static int toi_compress_print_debug_stats(char *buffer, int size)
7867 +{
7868 +       unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
7869 +                     pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
7870 +       int len;
7871 +
7872 +       /* Output the compression ratio achieved. */
7873 +       if (*toi_compressor_name)
7874 +               len = snprintf_used(buffer, size, "- Compressor is '%s'.\n",
7875 +                               toi_compressor_name);
7876 +       else
7877 +               len = snprintf_used(buffer, size, "- Compressor is not set.\n");
7878 +
7879 +       if (pages_in)
7880 +               len += snprintf_used(buffer+len, size - len,
7881 +                 "  Compressed %lu bytes into %lu (%lu percent compression).\n",
7882 +                 toi_compress_bytes_in,
7883 +                 toi_compress_bytes_out,
7884 +                 (pages_in - pages_out) * 100 / pages_in);
7885 +       return len;
7886 +}
7887 +
7888 +/*
7889 + * toi_compress_memory_needed
7890 + *
7891 + * Tell the caller how much memory we need to operate during hibernate/resume.
7892 + * Returns: Maximum number of bytes of memory required for
7893 + * operation.
7894 + */
7895 +static int toi_compress_memory_needed(void)
7896 +{
7897 +       return 2 * PAGE_SIZE;
7898 +}
7899 +
7900 +static int toi_compress_storage_needed(void)
7901 +{
7902 +       return 4 * sizeof(unsigned long) + strlen(toi_compressor_name) + 1;
7903 +}
7904 +
7905 +/*
7906 + * toi_compress_save_config_info
7907 + * @buffer: Pointer to a buffer of size PAGE_SIZE.
7908 + *
7909 + * Save information needed when reloading the image at resume time.
7910 + * Returns: Number of bytes used for saving our data.
7911 + */
7912 +static int toi_compress_save_config_info(char *buffer)
7913 +{
7914 +       int namelen = strlen(toi_compressor_name) + 1;
7915 +       int total_len;
7916 +
7917 +       *((unsigned long *) buffer) = toi_compress_bytes_in;
7918 +       *((unsigned long *) (buffer + 1 * sizeof(unsigned long))) =
7919 +               toi_compress_bytes_out;
7920 +       *((unsigned long *) (buffer + 2 * sizeof(unsigned long))) =
7921 +               toi_expected_compression;
7922 +       *((unsigned long *) (buffer + 3 * sizeof(unsigned long))) = namelen;
7923 +       strncpy(buffer + 4 * sizeof(unsigned long), toi_compressor_name,
7924 +                                                               namelen);
7925 +       total_len = 4 * sizeof(unsigned long) + namelen;
7926 +       return total_len;
7927 +}
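For reference, the blob written here (and parsed back by toi_compress_load_config_info below) is laid out as:

    offset 0 * sizeof(unsigned long):  toi_compress_bytes_in
    offset 1 * sizeof(unsigned long):  toi_compress_bytes_out
    offset 2 * sizeof(unsigned long):  toi_expected_compression
    offset 3 * sizeof(unsigned long):  namelen (strlen(toi_compressor_name) + 1)
    offset 4 * sizeof(unsigned long):  toi_compressor_name, namelen bytes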
7928 +
7929 +/* toi_compress_load_config_info
7930 + * @buffer: Pointer to the start of the data.
7931 + * @size: Number of bytes that were saved.
7932 + *
7933 + * Description:        Reload information needed for decompressing the image at
7934 + * resume time.
7935 + */
7936 +static void toi_compress_load_config_info(char *buffer, int size)
7937 +{
7938 +       int namelen;
7939 +
7940 +       toi_compress_bytes_in = *((unsigned long *) buffer);
7941 +       toi_compress_bytes_out = *((unsigned long *) (buffer + 1 *
7942 +                               sizeof(unsigned long)));
7943 +       toi_expected_compression = *((unsigned long *) (buffer + 2 *
7944 +                               sizeof(unsigned long)));
7945 +       namelen = *((unsigned long *) (buffer + 3 * sizeof(unsigned long)));
7946 +       if (strncmp(toi_compressor_name, buffer + 4 * sizeof(unsigned long),
7947 +                               namelen)) {
7948 +               toi_compress_cleanup(1);
7949 +               strncpy(toi_compressor_name, buffer + 4 * sizeof(unsigned long),
7950 +                       namelen);
7951 +               toi_compress_crypto_prepare();
7952 +       }
7953 +       return;
7954 +}
7955 +
7956 +/*
7957 + * toi_expected_compression_ratio
7958 + *
7959 + * Description:        Returns the expected ratio between data passed into this module
7960 + *             and the amount of data output when writing.
7961 + * Returns:    100 if the module is disabled. Otherwise the value set by the
7962 + *             user via our sysfs entry.
7963 + */
7964 +
7965 +static int toi_compress_expected_ratio(void)
7966 +{
7967 +       if (!toi_compression_ops.enabled)
7968 +               return 100;
7969 +       else
7970 +               return 100 - toi_expected_compression;
7971 +}
7972 +
7973 +/*
7974 + * data for our sysfs entries.
7975 + */
7976 +static struct toi_sysfs_data sysfs_params[] = {
7977 +       {
7978 +               TOI_ATTR("expected_compression", SYSFS_RW),
7979 +               SYSFS_INT(&toi_expected_compression, 0, 99, 0)
7980 +       },
7981 +
7982 +       {
7983 +               TOI_ATTR("enabled", SYSFS_RW),
7984 +               SYSFS_INT(&toi_compression_ops.enabled, 0, 1, 0)
7985 +       },
7986 +
7987 +       {
7988 +               TOI_ATTR("algorithm", SYSFS_RW),
7989 +               SYSFS_STRING(toi_compressor_name, 31, 0)
7990 +       }
7991 +};
7992 +
7993 +/*
7994 + * Ops structure.
7995 + */
7996 +static struct toi_module_ops toi_compression_ops = {
7997 +       .type                   = FILTER_MODULE,
7998 +       .name                   = "compression",
7999 +       .directory              = "compression",
8000 +       .module                 = THIS_MODULE,
8001 +       .initialise             = toi_compress_init,
8002 +       .cleanup                = toi_compress_cleanup,
8003 +       .memory_needed          = toi_compress_memory_needed,
8004 +       .print_debug_info       = toi_compress_print_debug_stats,
8005 +       .save_config_info       = toi_compress_save_config_info,
8006 +       .load_config_info       = toi_compress_load_config_info,
8007 +       .storage_needed         = toi_compress_storage_needed,
8008 +       .expected_compression   = toi_compress_expected_ratio,
8009 +
8010 +       .rw_init                = toi_compress_rw_init,
8011 +
8012 +       .write_page             = toi_compress_write_page,
8013 +       .read_page              = toi_compress_read_page,
8014 +
8015 +       .sysfs_data             = sysfs_params,
8016 +       .num_sysfs_entries      = sizeof(sysfs_params) /
8017 +               sizeof(struct toi_sysfs_data),
8018 +};
8019 +
8020 +/* ---- Registration ---- */
8021 +
8022 +static __init int toi_compress_load(void)
8023 +{
8024 +       return toi_register_module(&toi_compression_ops);
8025 +}
8026 +
8027 +#ifdef MODULE
8028 +static __exit void toi_compress_unload(void)
8029 +{
8030 +       toi_unregister_module(&toi_compression_ops);
8031 +}
8032 +
8033 +module_init(toi_compress_load);
8034 +module_exit(toi_compress_unload);
8035 +MODULE_LICENSE("GPL");
8036 +MODULE_AUTHOR("Nigel Cunningham");
8037 +MODULE_DESCRIPTION("Compression Support for TuxOnIce");
8038 +#else
8039 +late_initcall(toi_compress_load);
8040 +#endif
8041 diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
8042 new file mode 100644
8043 index 0000000..afeb11d
8044 --- /dev/null
8045 +++ b/kernel/power/tuxonice_extent.c
8046 @@ -0,0 +1,303 @@
8047 +/*
8048 + * kernel/power/tuxonice_extent.c
8049 + *
8050 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
8051 + *
8052 + * Distributed under GPLv2.
8053 + *
8054 + * These functions encapsulate the manipulation of storage metadata. For
8055 + * pageflags, we use dynamically allocated bitmaps.
8056 + */
8057 +
8058 +#include <linux/module.h>
8059 +#include <linux/suspend.h>
8060 +#include "tuxonice_modules.h"
8061 +#include "tuxonice_extent.h"
8062 +#include "tuxonice_alloc.h"
8063 +#include "tuxonice_ui.h"
8064 +#include "tuxonice.h"
8065 +
8066 +/* toi_get_extent
8067 + *
8068 + * Returns a free extent. May fail, returning NULL instead.
8069 + */
8070 +static struct hibernate_extent *toi_get_extent(void)
8071 +{
8072 +       return (struct hibernate_extent *) toi_kzalloc(2,
8073 +                       sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
8074 +}
8075 +
8076 +/* toi_put_extent_chain.
8077 + *
8078 + * Frees a whole chain of extents.
8079 + */
8080 +void toi_put_extent_chain(struct hibernate_extent_chain *chain)
8081 +{
8082 +       struct hibernate_extent *this;
8083 +
8084 +       this = chain->first;
8085 +
8086 +       while (this) {
8087 +               struct hibernate_extent *next = this->next;
8088 +               toi_kfree(2, this);
8089 +               chain->num_extents--;
8090 +               this = next;
8091 +       }
8092 +
8093 +       chain->first = NULL;
8094 +       chain->last_touched = NULL;
8095 +       chain->size = 0;
8096 +}
8097 +
8098 +/*
8099 + * toi_add_to_extent_chain
8100 + *
8101 + * Add an extent to an existing chain.
8102 + */
8103 +int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
8104 +               unsigned long start, unsigned long end)
8105 +{
8106 +       struct hibernate_extent *new_ext = NULL, *cur_ext = NULL;
8107 +
8108 +       /* Find the right place in the chain */
8109 +       if (chain->last_touched && chain->last_touched->start < start)
8110 +               cur_ext = chain->last_touched;
8111 +       else if (chain->first && chain->first->start < start)
8112 +               cur_ext = chain->first;
8113 +
8114 +       if (cur_ext) {
8115 +               while (cur_ext->next && cur_ext->next->start < start)
8116 +                       cur_ext = cur_ext->next;
8117 +
8118 +               if (cur_ext->end == (start - 1)) {
8119 +                       struct hibernate_extent *next_ext = cur_ext->next;
8120 +                       cur_ext->end = end;
8121 +
8122 +                       /* Merge with the following one? */
8123 +                       if (next_ext && cur_ext->end + 1 == next_ext->start) {
8124 +                               cur_ext->end = next_ext->end;
8125 +                               cur_ext->next = next_ext->next;
8126 +                               toi_kfree(2, next_ext);
8127 +                               chain->num_extents--;
8128 +                       }
8129 +
8130 +                       chain->last_touched = cur_ext;
8131 +                       chain->size += (end - start + 1);
8132 +
8133 +                       return 0;
8134 +               }
8135 +       }
8136 +
8137 +       new_ext = toi_get_extent();
8138 +       if (!new_ext) {
8139 +               printk(KERN_ERR "Unable to append a new extent to the "
8140 +                               "chain.\n");
8141 +               return -ENOMEM;
8142 +       }
8143 +
8144 +       chain->num_extents++;
8145 +       chain->size += (end - start + 1);
8146 +       new_ext->start = start;
8147 +       new_ext->end = end;
8148 +
8149 +       chain->last_touched = new_ext;
8150 +
8151 +       if (cur_ext) {
8152 +               new_ext->next = cur_ext->next;
8153 +               cur_ext->next = new_ext;
8154 +       } else {
8155 +               if (chain->first)
8156 +                       new_ext->next = chain->first;
8157 +               chain->first = new_ext;
8158 +       }
8159 +
8160 +       return 0;
8161 +}
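A worked example of the merge behaviour, as a hypothetical caller (the comments show the chain after each call):

    static void extent_merge_sketch(void)
    {
            struct hibernate_extent_chain c = { };

            toi_add_to_extent_chain(&c, 1, 5);    /* [1-5]                      */
            toi_add_to_extent_chain(&c, 6, 10);   /* [1-10], extended in place  */
            toi_add_to_extent_chain(&c, 12, 20);  /* [1-10] [12-20]             */
            toi_add_to_extent_chain(&c, 11, 11);  /* [1-20], bridged and merged */
    }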
8162 +
8163 +/* toi_serialise_extent_chain
8164 + *
8165 + * Write a chain in the image.
8166 + */
8167 +int toi_serialise_extent_chain(struct toi_module_ops *owner,
8168 +               struct hibernate_extent_chain *chain)
8169 +{
8170 +       struct hibernate_extent *this;
8171 +       int ret, i = 0;
8172 +
8173 +       ret = toiActiveAllocator->rw_header_chunk(WRITE, owner, (char *) chain,
8174 +                       2 * sizeof(int));
8175 +       if (ret)
8176 +               return ret;
8177 +
8178 +       this = chain->first;
8179 +       while (this) {
8180 +               ret = toiActiveAllocator->rw_header_chunk(WRITE, owner,
8181 +                               (char *) this, 2 * sizeof(unsigned long));
8182 +               if (ret)
8183 +                       return ret;
8184 +               this = this->next;
8185 +               i++;
8186 +       }
8187 +
8188 +       if (i != chain->num_extents) {
8189 +               printk(KERN_EMERG "Saved %d extents but chain metadata says "
8190 +                       "there should be %d.\n", i, chain->num_extents);
8191 +               return 1;
8192 +       }
8193 +
8194 +       return ret;
8195 +}
8196 +
8197 +/* toi_load_extent_chain
8198 + *
8199 + * Read back a chain saved in the image.
8200 + */
8201 +int toi_load_extent_chain(struct hibernate_extent_chain *chain)
8202 +{
8203 +       struct hibernate_extent *this, *last = NULL;
8204 +       int i, ret;
8205 +
8206 +       ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
8207 +                       (char *) chain, 2 * sizeof(int));
8208 +       if (ret) {
8209 +               printk(KERN_ERR "Failed to read size of extent chain.\n");
8210 +               return 1;
8211 +       }
8212 +
8213 +       for (i = 0; i < chain->num_extents; i++) {
8214 +               this = toi_kzalloc(3, sizeof(struct hibernate_extent),
8215 +                               TOI_ATOMIC_GFP);
8216 +               if (!this) {
8217 +                       printk(KERN_INFO "Failed to allocate a new extent.\n");
8218 +                       return -ENOMEM;
8219 +               }
8220 +               this->next = NULL;
8221 +               ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
8222 +                               NULL, (char *) this, 2 * sizeof(unsigned long));
8223 +               if (ret) {
8224 +                       printk(KERN_INFO "Failed to read an extent.\n");
     +                       toi_kfree(3, this); /* don't leak the unlinked extent */
8225 +                       return 1;
8226 +               }
8227 +               if (last)
8228 +                       last->next = this;
8229 +               else
8230 +                       chain->first = this;
8231 +               last = this;
8232 +       }
8233 +       return 0;
8234 +}
8235 +
8236 +/* toi_extent_state_next
8237 + *
8238 + * Given a state, progress to the next valid entry. We may begin in an
8239 + * invalid state, as we do when invoked after extent_state_goto_start below.
8240 + *
8241 + * When using compression and expected_compression > 0, we let the image size
8242 + * be larger than storage, so we can validly run out of data to return.
8243 + */
8244 +unsigned long toi_extent_state_next(struct hibernate_extent_iterate_state *state)
8245 +{
8246 +       if (state->current_chain == state->num_chains)
8247 +               return 0;
8248 +
8249 +       if (state->current_extent) {
8250 +               if (state->current_offset == state->current_extent->end) {
8251 +                       if (state->current_extent->next) {
8252 +                               state->current_extent =
8253 +                                       state->current_extent->next;
8254 +                               state->current_offset =
8255 +                                       state->current_extent->start;
8256 +                       } else {
8257 +                               state->current_extent = NULL;
8258 +                               state->current_offset = 0;
8259 +                       }
8260 +               } else
8261 +                       state->current_offset++;
8262 +       }
8263 +
8264 +       while (!state->current_extent) {
8265 +               int chain_num = ++(state->current_chain);
8266 +
8267 +               if (chain_num == state->num_chains)
8268 +                       return 0;
8269 +
8270 +               state->current_extent = (state->chains + chain_num)->first;
8271 +
8272 +               if (!state->current_extent)
8273 +                       continue;
8274 +
8275 +               state->current_offset = state->current_extent->start;
8276 +       }
8277 +
8278 +       return state->current_offset;
8279 +}
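A hypothetical walk over every offset in a group of chains, using toi_extent_state_next together with toi_extent_state_goto_start (defined just below); a return of 0 marks the end of the chains, matching the toi_extent_state_eof test in the header:

    unsigned long offset;

    toi_extent_state_goto_start(&state);              /* state: set up elsewhere */
    while ((offset = toi_extent_state_next(&state)))
            consume(offset);                          /* hypothetical consumer   */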
8280 +
8281 +/* toi_extent_state_goto_start
8282 + *
8283 + * Find the first valid value in a group of chains.
8284 + */
8285 +void toi_extent_state_goto_start(struct hibernate_extent_iterate_state *state)
8286 +{
8287 +       state->current_chain = -1;
8288 +       state->current_extent = NULL;
8289 +       state->current_offset = 0;
8290 +}
8291 +
8292 +/* toi_extent_start_save
8293 + *
8294 + * Given a state and a struct hibernate_extent_state_store, save the current
8295 + * position in a format that can be used with relocated chains (at
8296 + * resume time).
8297 + */
8298 +void toi_extent_state_save(struct hibernate_extent_iterate_state *state,
8299 +               struct hibernate_extent_iterate_saved_state *saved_state)
8300 +{
8301 +       struct hibernate_extent *extent;
8302 +
8303 +       saved_state->chain_num = state->current_chain;
8304 +       saved_state->extent_num = 0;
8305 +       saved_state->offset = state->current_offset;
8306 +
8307 +       if (saved_state->chain_num == -1)
8308 +               return;
8309 +
8310 +       extent = (state->chains + state->current_chain)->first;
8311 +
8312 +       while (extent != state->current_extent) {
8313 +               saved_state->extent_num++;
8314 +               extent = extent->next;
8315 +       }
8316 +}
8317 +
8318 +/* toi_extent_start_restore
8319 + *
8320 + * Restore the position saved by extent_state_save.
8321 + */
8322 +void toi_extent_state_restore(struct hibernate_extent_iterate_state *state,
8323 +               struct hibernate_extent_iterate_saved_state *saved_state)
8324 +{
8325 +       int posn = saved_state->extent_num;
8326 +
8327 +       if (saved_state->chain_num == -1) {
8328 +               toi_extent_state_goto_start(state);
8329 +               return;
8330 +       }
8331 +
8332 +       state->current_chain = saved_state->chain_num;
8333 +       state->current_extent = (state->chains + state->current_chain)->first;
8334 +       state->current_offset = saved_state->offset;
8335 +
8336 +       while (posn--)
8337 +               state->current_extent = state->current_extent->next;
8338 +}
8339 +
8340 +#ifdef CONFIG_TOI_EXPORTS
8341 +EXPORT_SYMBOL_GPL(toi_add_to_extent_chain);
8342 +EXPORT_SYMBOL_GPL(toi_put_extent_chain);
8343 +EXPORT_SYMBOL_GPL(toi_load_extent_chain);
8344 +EXPORT_SYMBOL_GPL(toi_serialise_extent_chain);
8345 +EXPORT_SYMBOL_GPL(toi_extent_state_save);
8346 +EXPORT_SYMBOL_GPL(toi_extent_state_restore);
8347 +EXPORT_SYMBOL_GPL(toi_extent_state_goto_start);
8348 +EXPORT_SYMBOL_GPL(toi_extent_state_next);
8349 +#endif
8350 diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
8351 new file mode 100644
8352 index 0000000..dc0e05e
8353 --- /dev/null
8354 +++ b/kernel/power/tuxonice_extent.h
8355 @@ -0,0 +1,72 @@
8356 +/*
8357 + * kernel/power/tuxonice_extent.h
8358 + *
8359 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
8360 + *
8361 + * This file is released under the GPLv2.
8362 + *
8363 + * It contains declarations related to extents. Extents are
8364 + * TuxOnIce's method of storing some of the metadata for the image.
8365 + * See tuxonice_extent.c for more info.
8366 + *
8367 + */
8368 +
8369 +#include "tuxonice_modules.h"
8370 +
8371 +#ifndef EXTENT_H
8372 +#define EXTENT_H
8373 +
8374 +struct hibernate_extent {
8375 +       unsigned long start, end;
8376 +       struct hibernate_extent *next;
8377 +};
8378 +
8379 +struct hibernate_extent_chain {
8380 +       int size; /* size of the chain ie sum (max-min+1) */
8381 +       int num_extents;
8382 +       struct hibernate_extent *first, *last_touched;
8383 +};
8384 +
8385 +struct hibernate_extent_iterate_state {
8386 +       struct hibernate_extent_chain *chains;
8387 +       int num_chains;
8388 +       int current_chain;
8389 +       struct hibernate_extent *current_extent;
8390 +       unsigned long current_offset;
8391 +};
8392 +
8393 +struct hibernate_extent_iterate_saved_state {
8394 +       int chain_num;
8395 +       int extent_num;
8396 +       unsigned long offset;
8397 +};
8398 +
8399 +#define toi_extent_state_eof(state) \
8400 +       ((state)->num_chains == (state)->current_chain)
8401 +
8402 +/* Simplify iterating through all the values in an extent chain */
8403 +#define toi_extent_for_each(extent_chain, extentpointer, value) \
8404 +if ((extent_chain)->first) \
8405 +       for ((extentpointer) = (extent_chain)->first, (value) = \
8406 +                       (extentpointer)->start; \
8407 +            ((extentpointer) && ((extentpointer)->next || (value) <= \
8408 +                                (extentpointer)->end)); \
8409 +            (((value) == (extentpointer)->end) ? \
8410 +               ((extentpointer) = (extentpointer)->next, (value) = \
8411 +                ((extentpointer) ? (extentpointer)->start : 0)) : \
8412 +                       (value)++))
8413 +
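A hypothetical use of the iterator macro above, visiting every value covered by a chain:

    struct hibernate_extent *ext;
    unsigned long val;

    toi_extent_for_each(&my_chain, ext, val)          /* my_chain: hypothetical */
            printk(KERN_DEBUG "covers %lu\n", val);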
8414 +void toi_put_extent_chain(struct hibernate_extent_chain *chain);
8415 +int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
8416 +               unsigned long start, unsigned long end);
8417 +int toi_serialise_extent_chain(struct toi_module_ops *owner,
8418 +               struct hibernate_extent_chain *chain);
8419 +int toi_load_extent_chain(struct hibernate_extent_chain *chain);
8420 +
8421 +void toi_extent_state_save(struct hibernate_extent_iterate_state *state,
8422 +               struct hibernate_extent_iterate_saved_state *saved_state);
8423 +void toi_extent_state_restore(struct hibernate_extent_iterate_state *state,
8424 +               struct hibernate_extent_iterate_saved_state *saved_state);
8425 +void toi_extent_state_goto_start(struct hibernate_extent_iterate_state *state);
8426 +unsigned long toi_extent_state_next(struct hibernate_extent_iterate_state *state);
8427 +#endif
8428 diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
8429 new file mode 100644
8430 index 0000000..15a57e2
8431 --- /dev/null
8432 +++ b/kernel/power/tuxonice_file.c
8433 @@ -0,0 +1,1125 @@
8434 +/*
8435 + * kernel/power/tuxonice_file.c
8436 + *
8437 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
8438 + *
8439 + * Distributed under GPLv2.
8440 + *
8441 + * This file encapsulates functions for usage of a simple file as a
8442 + * backing store. It is based upon the swapallocator, and shares the
8443 + * same basic working. Here, though, we have nothing to do with
8444 + * swapspace, and only one device to worry about.
8445 + *
8446 + * The user can just
8447 + *
8448 + * echo TuxOnIce > /path/to/my_file
8449 + *
8450 + * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
8451 + *
8452 + * and
8453 + *
8454 + * echo /path/to/my_file > /sys/power/tuxonice/file/target
8455 + *
8456 + * then put what they find in /sys/power/tuxonice/resume
8457 + * as their resume= parameter in lilo.conf (and rerun lilo if using it).
8458 + *
8459 + * Having done this, they're ready to hibernate and resume.
8460 + *
8461 + * TODO:
8462 + * - File resizing.
8463 + */
8464 +
8465 +#include <linux/suspend.h>
8466 +#include <linux/module.h>
8467 +#include <linux/blkdev.h>
8468 +#include <linux/file.h>
8469 +#include <linux/stat.h>
8470 +#include <linux/mount.h>
8471 +#include <linux/statfs.h>
8472 +#include <linux/syscalls.h>
8473 +#include <linux/namei.h>
8474 +#include <linux/fs.h>
8475 +#include <linux/root_dev.h>
8476 +
8477 +#include "tuxonice.h"
8478 +#include "tuxonice_sysfs.h"
8479 +#include "tuxonice_modules.h"
8480 +#include "tuxonice_ui.h"
8481 +#include "tuxonice_extent.h"
8482 +#include "tuxonice_io.h"
8483 +#include "tuxonice_storage.h"
8484 +#include "tuxonice_block_io.h"
8485 +#include "tuxonice_alloc.h"
8486 +
8487 +static struct toi_module_ops toi_fileops;
8488 +
8489 +/* Details of our target.  */
8490 +
8491 +char toi_file_target[256];
8492 +static struct inode *target_inode;
8493 +static struct file *target_file;
8494 +static struct block_device *toi_file_target_bdev;
8495 +static dev_t resume_file_dev_t;
8496 +static int used_devt;
8497 +static int setting_toi_file_target;
8498 +static sector_t target_firstblock, target_header_start;
8499 +static int target_storage_available;
8500 +static int target_claim;
8501 +
8502 +/* Old signatures */
8503 +static char HaveImage[] = "HaveImage\n";
8504 +static char NoImage[] =   "TuxOnIce\n";
8505 +#define sig_size (sizeof(HaveImage) + 1)
8506 +
8507 +struct toi_file_header {
8508 +       char sig[sig_size];
8509 +       int resumed_before;
8510 +       unsigned long first_header_block;
8511 +       int have_image;
8512 +};
8513 +
8514 +/* Header Page Information */
8515 +static int header_pages_reserved;
8516 +
8517 +/* Main Storage Pages */
8518 +static int main_pages_allocated, main_pages_requested;
8519 +
8520 +#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
8521 +
8522 +static struct toi_bdev_info devinfo;
8523 +
8524 +/* Extent chain for blocks */
8525 +static struct hibernate_extent_chain block_chain;
8526 +
8527 +/* Signature operations */
8528 +enum {
8529 +       GET_IMAGE_EXISTS,
8530 +       INVALIDATE,
8531 +       MARK_RESUME_ATTEMPTED,
8532 +       UNMARK_RESUME_ATTEMPTED,
8533 +};
8534 +
8535 +static void set_devinfo(struct block_device *bdev, int target_blkbits)
8536 +{
8537 +       devinfo.bdev = bdev;
8538 +       if (!target_blkbits) {
8539 +               devinfo.bmap_shift = devinfo.blocks_per_page = 0;
8540 +       } else {
8541 +               devinfo.bmap_shift = target_blkbits - 9;
8542 +               devinfo.blocks_per_page = (1 << (PAGE_SHIFT - target_blkbits));
8543 +       }
8544 +}
8545 +
8546 +static long raw_to_real(long raw)
8547 +{
8548 +       long result;
8549 +
8550 +       result = raw - (raw * (sizeof(unsigned long) + sizeof(int)) +
8551 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
8552 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
8553 +
8554 +       return result < 0 ? 0 : result;
8555 +}
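To make the overhead concrete: with 4096-byte pages on a 64-bit machine, each stored page costs sizeof(unsigned long) + sizeof(int) = 12 bytes of index metadata, so roughly raw * 12 / 4108 pages are handed back, rounded up. For raw = 25600 pages (100 MB of backing store) that is 25600 - (25600 * 12 + 4109) / 4108 = 25600 - 75 = 25525 usable pages.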
8556 +
8557 +static int toi_file_storage_available(void)
8558 +{
8559 +       int result = 0;
8560 +       struct block_device *bdev = toi_file_target_bdev;
8561 +
8562 +       if (!target_inode)
8563 +               return 0;
8564 +
8565 +       switch (target_inode->i_mode & S_IFMT) {
8566 +       case S_IFSOCK:
8567 +       case S_IFCHR:
8568 +       case S_IFIFO: /* Socket, Char, Fifo */
8569 +               return -1;
8570 +       case S_IFREG: /* Regular file: current size - holes + free
8571 +                        space on part */
8572 +               result = target_storage_available;
8573 +               break;
8574 +       case S_IFBLK: /* Block device */
8575 +               if (!bdev->bd_disk) {
8576 +                       printk(KERN_INFO "bdev->bd_disk null.\n");
8577 +                       return 0;
8578 +               }
8579 +
8580 +               result = (bdev->bd_part ?
8581 +                       bdev->bd_part->nr_sects :
8582 +                       bdev->bd_disk->capacity) >> (PAGE_SHIFT - 9);
8583 +       }
8584 +
8585 +       return raw_to_real(result);
8586 +}
8587 +
8588 +static int has_contiguous_blocks(int page_num)
8589 +{
8590 +       int j;
8591 +       sector_t last = 0;
8592 +
8593 +       for (j = 0; j < devinfo.blocks_per_page; j++) {
8594 +               sector_t this = bmap(target_inode,
8595 +                               page_num * devinfo.blocks_per_page + j);
8596 +
8597 +               if (!this || (last && (last + 1) != this))
8598 +                       break;
8599 +
8600 +               last = this;
8601 +       }
8602 +
8603 +       return j == devinfo.blocks_per_page;
8604 +}
8605 +
8606 +static int size_ignoring_ignored_pages(void)
8607 +{
8608 +       int mappable = 0, i;
8609 +
8610 +       if (!target_is_normal_file())
8611 +               return toi_file_storage_available();
8612 +
8613 +       for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++)
8614 +               if (has_contiguous_blocks(i))
8615 +                       mappable++;
8616 +
8617 +       return mappable;
8618 +}
8619 +
8620 +static int __populate_block_list(int min, int max)
8621 +{
8622 +       if (test_action_state(TOI_TEST_BIO))
8623 +               printk(KERN_INFO "Adding extent %d-%d.\n",
8624 +                       min << devinfo.bmap_shift,
8625 +                       ((max + 1) << devinfo.bmap_shift) - 1);
8626 +
8627 +       return toi_add_to_extent_chain(&block_chain, min, max);
8628 +}
8629 +
8630 +static int apply_header_reservation(void)
8631 +{
8632 +       int i;
8633 +
8634 +       /* Apply header space reservation */
8635 +       toi_extent_state_goto_start(&toi_writer_posn);
8636 +       toi_bio_ops.forward_one_page(1); /* To first page */
8637 +
8638 +       for (i = 0; i < header_pages_reserved; i++)
8639 +               if (toi_bio_ops.forward_one_page(1))
8640 +                       return -ENOSPC;
8641 +
8642 +       /* The end of header pages will be the start of pageset 2 */
8643 +       toi_extent_state_save(&toi_writer_posn, &toi_writer_posn_save[2]);
8644 +
8645 +       return 0;
8646 +}
8647 +
8648 +static int populate_block_list(void)
8649 +{
8650 +       int i, extent_min = -1, extent_max = -1, got_header = 0, result = 0;
8651 +
8652 +       if (block_chain.first)
8653 +               toi_put_extent_chain(&block_chain);
8654 +
8655 +       if (!target_is_normal_file()) {
8656 +               return (target_storage_available > 0) ?
8657 +                       __populate_block_list(devinfo.blocks_per_page,
8658 +                               (target_storage_available + 1) *
8659 +                               devinfo.blocks_per_page - 1) : 0;
8660 +       }
8661 +
8662 +       for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
8663 +               sector_t new_sector;
8664 +
8665 +               if (!has_contiguous_blocks(i))
8666 +                       continue;
8667 +
8668 +               new_sector = bmap(target_inode,
8669 +               (i * devinfo.blocks_per_page));
8670 +
8671 +               /*
8672 +                * Ignore the first block in the file.
8673 +                * It gets the header.
8674 +                */
8675 +               if (new_sector == target_firstblock >> devinfo.bmap_shift) {
8676 +                       got_header = 1;
8677 +                       continue;
8678 +               }
8679 +
8680 +               /*
8681 +                * I'd love to be able to fill in holes and resize
8682 +                * files, but not yet...
8683 +                */
8684 +
8685 +               if (new_sector == extent_max + 1)
8686 +                       extent_max += devinfo.blocks_per_page;
8687 +               else {
8688 +                       if (extent_min > -1) {
8689 +                               result = __populate_block_list(extent_min,
8690 +                                               extent_max);
8691 +                               if (result)
8692 +                                       return result;
8693 +                       }
8694 +
8695 +                       extent_min = new_sector;
8696 +                       extent_max = extent_min +
8697 +                               devinfo.blocks_per_page - 1;
8698 +               }
8699 +       }
8700 +
8701 +       if (extent_min > -1) {
8702 +               result = __populate_block_list(extent_min, extent_max);
8703 +               if (result)
8704 +                       return result;
8705 +       }
8706 +
8707 +       return apply_header_reservation();
8708 +}
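+
+/*
+ * Sketch of the extent-merging logic in populate_block_list() above,
+ * with a hypothetical add_extent() callback standing in for
+ * __populate_block_list(): each run of consecutive device blocks is
+ * collapsed into a single [min, max] extent.
+ */
+static void coalesce_into_extents(const sector_t *first_block, int pages,
+               int blocks_per_page,
+               void (*add_extent)(sector_t min, sector_t max))
+{
+       sector_t extent_min = 0, extent_max = 0;
+       int i, have_extent = 0;
+
+       for (i = 0; i < pages; i++) {
+               if (have_extent && first_block[i] == extent_max + 1) {
+                       /* This page continues the current run. */
+                       extent_max += blocks_per_page;
+                       continue;
+               }
+
+               if (have_extent)
+                       add_extent(extent_min, extent_max);
+
+               extent_min = first_block[i];
+               extent_max = extent_min + blocks_per_page - 1;
+               have_extent = 1;
+       }
+
+       if (have_extent)
+               add_extent(extent_min, extent_max);
+}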
8709 +
8710 +static void toi_file_cleanup(int finishing_cycle)
8711 +{
8712 +       if (toi_file_target_bdev) {
8713 +               if (target_claim) {
8714 +                       bd_release(toi_file_target_bdev);
8715 +                       target_claim = 0;
8716 +               }
8717 +
8718 +               if (used_devt) {
8719 +                       blkdev_put(toi_file_target_bdev);
8720 +                       used_devt = 0;
8721 +               }
8722 +               toi_file_target_bdev = NULL;
8723 +               target_inode = NULL;
8724 +               set_devinfo(NULL, 0);
8725 +               target_storage_available = 0;
8726 +       }
8727 +
8728 +       if (target_file) {
8729 +               filp_close(target_file, NULL);
8730 +               target_file = NULL;
8731 +       }
8732 +}
8733 +
8734 +/*
8735 + * reopen_resume_devt
8736 + *
8737 + * Having opened resume= once, we remember the major and
8738 + * minor nodes and use them to reopen the bdev for checking
8739 + * whether an image exists (possibly when starting a resume).
8740 + */
8741 +static void reopen_resume_devt(void)
8742 +{
8743 +       toi_file_target_bdev = toi_open_by_devnum(resume_file_dev_t,
8744 +                       FMODE_READ);
8745 +       if (IS_ERR(toi_file_target_bdev)) {
8746 +               printk(KERN_INFO "Got a dev_num (%lx) but failed to open it.\n",
8747 +                               (unsigned long) resume_file_dev_t);
8748 +               return;
8749 +       }
8750 +       target_inode = toi_file_target_bdev->bd_inode;
8751 +       set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
8752 +}
8753 +
8754 +static void toi_file_get_target_info(char *target, int get_size,
8755 +               int resume_param)
8756 +{
8757 +       if (target_file)
8758 +               toi_file_cleanup(0);
8759 +
8760 +       if (!target || !strlen(target))
8761 +               return;
8762 +
8763 +       target_file = filp_open(target, O_RDWR|O_LARGEFILE, 0);
8764 +
8765 +       if (IS_ERR(target_file) || !target_file) {
8766 +
8767 +               if (!resume_param) {
8768 +                       printk(KERN_INFO "Open file %s returned %p.\n",
8769 +                                       target, target_file);
8770 +                       target_file = NULL;
8771 +                       return;
8772 +               }
8773 +
8774 +               target_file = NULL;
8775 +               resume_file_dev_t = name_to_dev_t(target);
8776 +               if (!resume_file_dev_t) {
8777 +                       struct kstat stat;
8778 +                       int error = vfs_stat(target, &stat);
8779 +                       printk(KERN_INFO "Open file %s returned %p and "
8780 +                                       "name_to_dev_t failed.\n", target,
8781 +                                       target_file);
8782 +                       if (error)
8783 +                       printk(KERN_INFO "stat() on the file also failed."
8784 +                                       " Nothing more we can do.\n");
8785 +                       else
8786 +                               resume_file_dev_t = stat.rdev;
8787 +                       return;
8788 +               }
8789 +
8790 +               toi_file_target_bdev = toi_open_by_devnum(resume_file_dev_t,
8791 +                               FMODE_READ);
8792 +               if (IS_ERR(toi_file_target_bdev)) {
8793 +                       printk(KERN_INFO "Got a dev_num (%lx) but failed to "
8794 +                                       "open it.\n",
8795 +                                       (unsigned long) resume_file_dev_t);
8796 +                       return;
8797 +               }
8798 +               used_devt = 1;
8799 +               target_inode = toi_file_target_bdev->bd_inode;
8800 +       } else
8801 +               target_inode = target_file->f_mapping->host;
8802 +
8803 +       if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
8804 +           S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
8805 +               printk(KERN_INFO "File support works with regular files,"
8806 +                               " character devices and block devices.\n");
8807 +               goto cleanup;
8808 +       }
8809 +
8810 +       if (!used_devt) {
8811 +               if (S_ISBLK(target_inode->i_mode)) {
8812 +                       toi_file_target_bdev = I_BDEV(target_inode);
8813 +                       if (!bd_claim(toi_file_target_bdev, &toi_fileops))
8814 +                               target_claim = 1;
8815 +               } else
8816 +                       toi_file_target_bdev = target_inode->i_sb->s_bdev;
8817 +               resume_file_dev_t = toi_file_target_bdev->bd_dev;
8818 +       }
8819 +
8820 +       set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
8821 +
8822 +       if (get_size)
8823 +               target_storage_available = size_ignoring_ignored_pages();
8824 +
8825 +       if (!resume_param)
8826 +               target_firstblock = bmap(target_inode, 0) << devinfo.bmap_shift;
8827 +
8828 +       return;
8829 +cleanup:
8830 +       target_inode = NULL;
8831 +       if (target_file) {
8832 +               filp_close(target_file, NULL);
8833 +               target_file = NULL;
8834 +       }
8835 +       set_devinfo(NULL, 0);
8836 +       target_storage_available = 0;
8837 +}
8838 +
8839 +static void toi_file_noresume_reset(void)
8840 +{
8841 +       toi_bio_ops.rw_cleanup(READ);
8842 +}
8843 +
8844 +static int parse_signature(struct toi_file_header *header)
8845 +{
8846 +       int have_image = !memcmp(HaveImage, header->sig, sizeof(HaveImage) - 1);
8847 +       int no_image_header = !memcmp(NoImage, header->sig,
8848 +                       sizeof(NoImage) - 1);
8849 +       int binary_sig = !memcmp(tuxonice_signature, header->sig,
8850 +                       sizeof(tuxonice_signature));
8851 +
8852 +       if (no_image_header || (binary_sig && !header->have_image))
8853 +               return 0;
8854 +
8855 +       if (!have_image && !binary_sig)
8856 +               return -1;
8857 +
8858 +       if (header->resumed_before)
8859 +               set_toi_state(TOI_RESUMED_BEFORE);
8860 +       else
8861 +               clear_toi_state(TOI_RESUMED_BEFORE);
8862 +
8863 +       target_header_start = header->first_header_block;
8864 +       return 1;
8865 +}
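+
+/*
+ * parse_signature() above returns:
+ *   -1  nothing we recognise at the start of the target;
+ *    0  a signature, but no image (fresh or invalidated target);
+ *    1  a valid image (and target_header_start has been set).
+ */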
8866 +
8867 +/* prepare_signature: fill in a header, ready for writing to disk. */
8868 +
8869 +static int prepare_signature(struct toi_file_header *current_header,
8870 +               unsigned long first_header_block)
8871 +{
8872 +       strncpy(current_header->sig, tuxonice_signature,
8873 +                       sizeof(tuxonice_signature));
8874 +       current_header->resumed_before = 0;
8875 +       current_header->first_header_block = first_header_block;
8876 +       current_header->have_image = 1;
8877 +       return 0;
8878 +}
8879 +
8880 +static int toi_file_storage_allocated(void)
8881 +{
8882 +       if (!target_inode)
8883 +               return 0;
8884 +
8885 +       if (target_is_normal_file())
8886 +               return (int) raw_to_real(target_storage_available);
8887 +       else
8888 +               return (int) raw_to_real(main_pages_requested);
8889 +}
8890 +
8891 +static int toi_file_release_storage(void)
8892 +{
8893 +       if (test_action_state(TOI_KEEP_IMAGE) &&
8894 +           test_toi_state(TOI_NOW_RESUMING))
8895 +               return 0;
8896 +
8897 +       toi_put_extent_chain(&block_chain);
8898 +
8899 +       header_pages_reserved = 0;
8900 +       main_pages_allocated = 0;
8901 +       main_pages_requested = 0;
8902 +       return 0;
8903 +}
8904 +
8905 +static void toi_file_reserve_header_space(int request)
8906 +{
8907 +       header_pages_reserved = request;
8908 +       apply_header_reservation();
8909 +}
8910 +
8911 +static int toi_file_allocate_storage(int main_space_requested)
8912 +{
8913 +       int result = 0;
8914 +
8915 +       int extra_pages = DIV_ROUND_UP(main_space_requested *
8916 +                       (sizeof(unsigned long) + sizeof(int)), PAGE_SIZE);
8917 +       int pages_to_get = main_space_requested + extra_pages +
8918 +               header_pages_reserved;
8919 +       int blocks_to_get = pages_to_get - block_chain.size;
8920 +
8921 +       /* Only release_storage reduces the size */
8922 +       if (blocks_to_get < 1)
8923 +               return 0;
8924 +
8925 +       result = populate_block_list();
8926 +
8927 +       if (result)
8928 +               return result;
8929 +
8930 +       toi_message(TOI_WRITER, TOI_MEDIUM, 0,
8931 +               "Finished with block_chain.size == %d.\n",
8932 +               block_chain.size);
8933 +
8934 +       if (block_chain.size < pages_to_get) {
8935 +               printk("Block chain size (%d) < header pages (%d) + extra "
8936 +                       "pages (%d) + main pages (%d) (=%d pages).\n",
8937 +                       block_chain.size, header_pages_reserved, extra_pages,
8938 +                       main_space_requested, pages_to_get);
8939 +               result = -ENOSPC;
8940 +       }
8941 +
8942 +       main_pages_requested = main_space_requested;
8943 +       main_pages_allocated = main_space_requested + extra_pages;
8944 +       return result;
8945 +}
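+
+/*
+ * Worked example with illustrative numbers, assuming 4 KB pages and 12
+ * bytes of index data per page (sizeof(unsigned long) + sizeof(int) on
+ * a 64-bit build): saving 100,000 pages needs
+ * DIV_ROUND_UP(100000 * 12, 4096) = 293 extra pages, so with 10
+ * reserved header pages the block chain must cover 100,303 pages.
+ */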
8946 +
8947 +static int toi_file_write_header_init(void)
8948 +{
8949 +       int result;
8950 +
8951 +       toi_bio_ops.rw_init(WRITE, 0);
8952 +       toi_writer_buffer_posn = 0;
8953 +
8954 +       /* Info needed to bootstrap goes at the start of the header.
8955 +        * First we save the basic info needed for reading, including the number
8956 +        * of header pages. Then we save the structs containing data needed
8957 +        * for reading the header pages back.
8958 +        * Note that even if the header spans more than one page, when we
8959 +        * read back the info, we will have restored the location of the
8960 +        * next header page by the time we go to use it.
8961 +        */
8962 +
8963 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_fileops,
8964 +                       (char *) &toi_writer_posn_save,
8965 +                       sizeof(toi_writer_posn_save));
8966 +
8967 +       if (result)
8968 +               return result;
8969 +
8970 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_fileops,
8971 +                       (char *) &devinfo, sizeof(devinfo));
8972 +
8973 +       if (result)
8974 +               return result;
8975 +
8976 +       toi_serialise_extent_chain(&toi_fileops, &block_chain);
8977 +
8978 +       return 0;
8979 +}
8980 +
8981 +static int toi_file_write_header_cleanup(void)
8982 +{
8983 +       struct toi_file_header *header;
8984 +       int result;
8985 +       unsigned long sig_page = toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
8986 +
8987 +       /* Write any unsaved data */
8988 +       if (toi_writer_buffer_posn)
8989 +               toi_bio_ops.write_header_chunk_finish();
8990 +
8991 +       toi_bio_ops.finish_all_io();
8992 +
8993 +       toi_extent_state_goto_start(&toi_writer_posn);
8994 +       toi_bio_ops.forward_one_page(1);
8995 +
8996 +       /* Adjust image header */
8997 +       result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
8998 +                       target_firstblock,
8999 +                       virt_to_page(sig_page));
9000 +       if (result)
9001 +               goto out;
9002 +
9003 +       header = (struct toi_file_header *) sig_page;
9004 +
9005 +       prepare_signature(header,
9006 +                       toi_writer_posn.current_offset <<
9007 +                       devinfo.bmap_shift);
9008 +
9009 +       result = toi_bio_ops.bdev_page_io(WRITE, toi_file_target_bdev,
9010 +                       target_firstblock,
9011 +                       virt_to_page(sig_page));
9012 +
9013 +out:
9014 +       toi_bio_ops.finish_all_io();
9015 +       toi_free_page(38, sig_page);
9016 +
9017 +       return result;
9018 +}
9019 +
9020 +/* HEADER READING */
9021 +
9022 +/*
9023 + * read_header_init()
9024 + *
9025 + * Description:
9026 + * 1. Attempt to read the device specified with resume=.
9027 + * 2. Check the contents of the header for our signature.
9028 + * 3. Warn, ignore, reset and/or continue as appropriate.
9029 + * 4. If continuing, read the toi_file configuration section
9030 + *    of the header and set up block device info so we can read
9031 + *    the rest of the header & image.
9032 + *
9033 + * Returns:
9034 + * May not return if the user chooses to reboot at a warning.
9035 + * -EINVAL if cannot resume at this time. Booting should continue
9036 + * normally.
9037 + */
9038 +
9039 +static int toi_file_read_header_init(void)
9040 +{
9041 +       int result;
9042 +       struct block_device *tmp;
9043 +
9044 +       toi_bio_ops.read_header_init();
9045 +
9046 +       /* Read toi_file configuration */
9047 +       result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
9048 +                       target_header_start,
9049 +                       virt_to_page((unsigned long) toi_writer_buffer));
9050 +
9051 +       if (result) {
9052 +               printk("FileAllocator read header init: Failed to initialise "
9053 +                               "reading the first page of data.\n");
9054 +               toi_bio_ops.rw_cleanup(READ);
9055 +               return result;
9056 +       }
9057 +
9058 +       memcpy(&toi_writer_posn_save, toi_writer_buffer,
9059 +              sizeof(toi_writer_posn_save));
9060 +
9061 +       toi_writer_buffer_posn = sizeof(toi_writer_posn_save);
9062 +
9063 +       tmp = devinfo.bdev;
9064 +
9065 +       memcpy(&devinfo,
9066 +              toi_writer_buffer + toi_writer_buffer_posn,
9067 +              sizeof(devinfo));
9068 +
9069 +       devinfo.bdev = tmp;
9070 +       toi_writer_buffer_posn += sizeof(devinfo);
9071 +
9072 +       toi_extent_state_goto_start(&toi_writer_posn);
9073 +       toi_bio_ops.set_extra_page_forward();
9074 +
9075 +       return toi_load_extent_chain(&block_chain);
9076 +}
9077 +
9078 +static int toi_file_read_header_cleanup(void)
9079 +{
9080 +       toi_bio_ops.rw_cleanup(READ);
9081 +       return 0;
9082 +}
9083 +
9084 +static int toi_file_signature_op(int op)
9085 +{
9086 +       char *cur;
9087 +       int result = 0, changed = 0;
9088 +       struct toi_file_header *header;
9089 +
9090 +       if (!toi_file_target_bdev)
9091 +               return -1;
9092 +
9093 +       cur = (char *) toi_get_zeroed_page(17, TOI_ATOMIC_GFP);
9094 +       if (!cur) {
9095 +               printk("Unable to allocate a page for reading the image "
9096 +                               "signature.\n");
9097 +               return -ENOMEM;
9098 +       }
9099 +
9100 +       result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
9101 +                       target_firstblock,
9102 +                       virt_to_page(cur));
9103 +
9104 +       if (result)
9105 +               goto out;
9106 +
9107 +       header = (struct toi_file_header *) cur;
9108 +       result = parse_signature(header);
9109 +
9110 +       switch (op) {
9111 +       case INVALIDATE:
9112 +               if (result == -1)
9113 +                       goto out;
9114 +
9115 +               memcpy(header->sig, tuxonice_signature,
9116 +                               sizeof(tuxonice_signature));
9117 +               header->resumed_before = 0;
9118 +               header->have_image = 0;
9119 +               result = changed = 1;
9120 +               break;
9121 +       case MARK_RESUME_ATTEMPTED:
9122 +               if (result == 1) {
9123 +                       header->resumed_before = 1;
9124 +                       changed = 1;
9125 +               }
9126 +               break;
9127 +       case UNMARK_RESUME_ATTEMPTED:
9128 +               if (result == 1) {
9129 +                       header->resumed_before = 0;
9130 +                       changed = 1;
9131 +               }
9132 +               break;
9133 +       }
9134 +
9135 +       if (changed) {
9136 +               int io_result = toi_bio_ops.bdev_page_io(WRITE,
9137 +                               toi_file_target_bdev, target_firstblock,
9138 +                               virt_to_page(cur));
9139 +               if (io_result)
9140 +                       result = io_result;
9141 +       }
9142 +
9143 +out:
9144 +       toi_bio_ops.finish_all_io();
9145 +       toi_free_page(17, (unsigned long) cur);
9146 +       return result;
9147 +}
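+
+/*
+ * Operations handled by toi_file_signature_op() above:
+ *   GET_IMAGE_EXISTS              return parse_signature()'s verdict
+ *                                 without modifying the target;
+ *   INVALIDATE                    rewrite the signature with
+ *                                 have_image = 0;
+ *   MARK/UNMARK_RESUME_ATTEMPTED  toggle resumed_before when a valid
+ *                                 image is present.
+ */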
9148 +
9149 +/* Print debug info
9150 + *
9151 + * Description: Report the allocator's state and available storage.
9152 + */
9153 +
9154 +static int toi_file_print_debug_stats(char *buffer, int size)
9155 +{
9156 +       int len = 0;
9157 +
9158 +       if (toiActiveAllocator != &toi_fileops) {
9159 +               len = snprintf_used(buffer, size,
9160 +                               "- FileAllocator inactive.\n");
9161 +               return len;
9162 +       }
9163 +
9164 +       len = snprintf_used(buffer, size, "- FileAllocator active.\n");
9165 +
9166 +       len += snprintf_used(buffer+len, size-len, "  Storage available for "
9167 +                       "image: %d pages.\n",
9168 +                       toi_file_storage_allocated());
9169 +
9170 +       return len;
9171 +}
9172 +
9173 +/*
9174 + * Storage needed
9175 + *
9176 + * Returns amount of space in the image header required
9177 + * for the toi_file's data.
9178 + *
9179 + * We ensure the space is allocated, but actually save the
9180 + * data from write_header_init and therefore don't also define a
9181 + * save_config_info routine.
9182 + */
9183 +static int toi_file_storage_needed(void)
9184 +{
9185 +       return sig_size + strlen(toi_file_target) + 1 +
9186 +               sizeof(toi_writer_posn_save) +
9187 +               sizeof(devinfo) +
9188 +               sizeof(struct hibernate_extent_chain) - 2 * sizeof(void *) +
9189 +               (2 * sizeof(unsigned long) * block_chain.num_extents);
9190 +}
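+
+/*
+ * Note that each extent in block_chain costs 2 * sizeof(unsigned long)
+ * header bytes (16 on a 64-bit build), so a heavily fragmented target
+ * file enlarges the image header accordingly.
+ */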
9191 +
9192 +/*
9193 + * toi_file_remove_image
9194 + * Release our storage and invalidate the image signature.
9195 + */
9196 +static int toi_file_remove_image(void)
9197 +{
9198 +       toi_file_release_storage();
9199 +       return toi_file_signature_op(INVALIDATE);
9200 +}
9201 +
9202 +/*
9203 + * image_exists
9204 + * Check the target for a valid image signature.
9205 + */
9206 +
9207 +static int toi_file_image_exists(int quiet)
9208 +{
9209 +       if (!toi_file_target_bdev)
9210 +               reopen_resume_devt();
9211 +
9212 +       return toi_file_signature_op(GET_IMAGE_EXISTS);
9213 +}
9214 +
9215 +/*
9216 + * Mark resume attempted.
9217 + *
9218 + * Record that we tried to resume from this image.
9219 + */
9220 +
9221 +static int toi_file_mark_resume_attempted(int mark)
9222 +{
9223 +       return toi_file_signature_op(mark ? MARK_RESUME_ATTEMPTED :
9224 +               UNMARK_RESUME_ATTEMPTED);
9225 +}
9226 +
9227 +static void toi_file_set_resume_param(void)
9228 +{
9229 +       char *buffer = (char *) toi_get_zeroed_page(18, TOI_ATOMIC_GFP);
9230 +       char *buffer2 = (char *) toi_get_zeroed_page(19, TOI_ATOMIC_GFP);
9231 +       unsigned long sector = bmap(target_inode, 0);
9232 +       int offset = 0;
9233 +
9234 +       if (!buffer || !buffer2) {
9235 +               if (buffer)
9236 +                       toi_free_page(18, (unsigned long) buffer);
9237 +               if (buffer2)
9238 +                       toi_free_page(19, (unsigned long) buffer2);
9239 +               printk("TuxOnIce: Failed to allocate memory while setting "
9240 +                               "resume= parameter.\n");
9241 +               return;
9242 +       }
9243 +
9244 +       if (toi_file_target_bdev) {
9245 +               set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
9246 +
9247 +               bdevname(toi_file_target_bdev, buffer2);
9248 +               offset += snprintf(buffer + offset, PAGE_SIZE - offset,
9249 +                               "/dev/%s", buffer2);
9250 +
9251 +               if (sector)
9252 +                       offset += snprintf(buffer + offset, PAGE_SIZE - offset,
9253 +                               ":0x%lx", sector << devinfo.bmap_shift);
9254 +       } else
9255 +               offset += snprintf(buffer + offset, PAGE_SIZE - offset,
9256 +                               "%s is not a valid target.", toi_file_target);
9257 +
9258 +       sprintf(resume_file, "file:%s", buffer);
9259 +
9260 +       toi_free_page(18, (unsigned long) buffer);
9261 +       toi_free_page(19, (unsigned long) buffer2);
9262 +
9263 +       toi_attempt_to_parse_resume_device(1);
9264 +}
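+
+/*
+ * Example result (hypothetical device and offset): with the target on
+ * /dev/sda1 and its first block at 0x130 after shifting, resume_file
+ * becomes "file:/dev/sda1:0x130".
+ */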
9265 +
9266 +static int __test_toi_file_target(char *target, int resume_time, int quiet)
9267 +{
9268 +       toi_file_get_target_info(target, 0, resume_time);
9269 +       if (toi_file_signature_op(GET_IMAGE_EXISTS) > -1) {
9270 +               if (!quiet)
9271 +                       printk(KERN_INFO "TuxOnIce: FileAllocator: File "
9272 +                                       "signature found.\n");
9273 +               if (!resume_time)
9274 +                       toi_file_set_resume_param();
9275 +
9276 +               toi_bio_ops.set_devinfo(&devinfo);
9277 +               toi_writer_posn.chains = &block_chain;
9278 +               toi_writer_posn.num_chains = 1;
9279 +
9280 +               if (!resume_time)
9281 +                       set_toi_state(TOI_CAN_HIBERNATE);
9282 +               return 0;
9283 +       }
9284 +
9285 +       clear_toi_state(TOI_CAN_HIBERNATE);
9286 +
9287 +       if (quiet)
9288 +               return 1;
9289 +
9290 +       if (*target)
9291 +               printk(KERN_INFO "TuxOnIce: FileAllocator: Sorry. No signature "
9292 +                               "found at %s.\n", target);
9293 +       else
9294 +               if (!resume_time)
9295 +                       printk(KERN_INFO "TuxOnIce: FileAllocator: Sorry. "
9296 +                                       "Target is not set for hibernating.\n");
9297 +
9298 +       return 1;
9299 +}
9300 +
9301 +static void test_toi_file_target(void)
9302 +{
9303 +       setting_toi_file_target = 1;
9304 +
9305 +       printk(KERN_INFO "TuxOnIce: Hibernating %sabled.\n",
9306 +                       __test_toi_file_target(toi_file_target, 0, 1) ?
9307 +                       "dis" : "en");
9308 +
9309 +       setting_toi_file_target = 0;
9310 +}
9311 +
9312 +/*
9313 + * Parse Image Location
9314 + *
9315 + * Attempt to parse a resume= parameter.
9316 + * File Allocator accepts:
9317 + * resume=file:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
9318 + *
9319 + * Where:
9320 + * DEVNAME is convertible to a dev_t by name_to_dev_t.
9321 + * FIRSTBLOCK is the location of the first block in the file.
9322 + * BLOCKSIZE is the device's logical block size: >= SECTOR_SIZE,
9323 + * <= PAGE_SIZE and a multiple of SECTOR_SIZE.
9324 + * Data is validated by attempting to read a header from the
9325 + * location given. Failure will result in toi_file refusing to
9326 + * save an image, and a reboot with correct parameters will be
9327 + * necessary.
9328 + */
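+
+/*
+ * Example parameters (device names and numbers are illustrative only):
+ *
+ *   resume=file:/dev/sda2             FIRSTBLOCK probed via bmap
+ *   resume=file:/dev/sda2:0x130       explicit FIRSTBLOCK
+ *   resume=file:/dev/sda2:0x130@4096  explicit FIRSTBLOCK and BLOCKSIZE
+ */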
9329 +
9330 +static int toi_file_parse_sig_location(char *commandline,
9331 +               int only_writer, int quiet)
9332 +{
9333 +       char *thischar, *devstart = NULL, *colon = NULL, *at_symbol = NULL;
9334 +       int result = -EINVAL, target_blocksize = 0;
9335 +
9336 +       if (strncmp(commandline, "file:", 5)) {
9337 +               if (!only_writer)
9338 +                       return 1;
9339 +       } else
9340 +               commandline += 5;
9341 +
9342 +       /*
9343 +        * Don't check signature again if we're beginning a cycle. If we already
9344 +        * did the initialisation successfully, assume we'll be okay when it
9345 +        * comes to resuming.
9346 +        */
9347 +       if (toi_file_target_bdev)
9348 +               return 0;
9349 +
9350 +       devstart = thischar = commandline;
9351 +       while ((*thischar != ':') && (*thischar != '@') &&
9352 +               ((thischar - commandline) < 250) && (*thischar))
9353 +               thischar++;
9354 +
9355 +       if (*thischar == ':') {
9356 +               colon = thischar;
9357 +               *colon = 0;
9358 +               thischar++;
9359 +       }
9360 +
9361 +       while ((*thischar != '@') && ((thischar - commandline) < 250)
9362 +                       && (*thischar))
9363 +               thischar++;
9364 +
9365 +       if (*thischar == '@') {
9366 +               at_symbol = thischar;
9367 +               *at_symbol = 0;
9368 +       }
9369 +
9370 +       /*
9371 +        * With toi_file, it is possible to be able to resume but not
9372 +        * hibernate: resume= may be set correctly while toi_file_target
9373 +        * isn't.
9374 +        *
9375 +        * We may have come here as a result of setting resume= or
9376 +        * toi_file_target. We only test the toi_file target in the
9377 +        * former case (it's already done in the latter), and we do it
9378 +        * before setting the block number ourselves, since the test would
9379 +        * otherwise overwrite the values given on the command line.
9380 +        */
9381 +
9382 +       if (!setting_toi_file_target)
9383 +               __test_toi_file_target(toi_file_target, 1, 0);
9384 +
9385 +       if (colon)
9386 +               target_firstblock = (int) simple_strtoul(colon + 1, NULL, 0);
9387 +       else
9388 +               target_firstblock = 0;
9389 +
9390 +       if (at_symbol) {
9391 +               target_blocksize = (int) simple_strtoul(at_symbol + 1, NULL, 0);
9392 +               if (target_blocksize & (SECTOR_SIZE - 1)) {
9393 +                       printk(KERN_INFO "FileAllocator: Blocksizes are "
9394 +                                       "multiples of %d.\n", SECTOR_SIZE);
9395 +                       result = -EINVAL;
9396 +                       goto out;
9397 +               }
9398 +       }
9399 +
9400 +       if (!quiet)
9401 +               printk(KERN_INFO "TuxOnIce FileAllocator: Testing whether you"
9402 +                               " can resume:\n");
9403 +
9404 +       toi_file_get_target_info(commandline, 0, 1);
9405 +
9406 +       if (!toi_file_target_bdev || IS_ERR(toi_file_target_bdev)) {
9407 +               toi_file_target_bdev = NULL;
9408 +               result = -1;
9409 +               goto out;
9410 +       }
9411 +
9412 +       if (target_blocksize)
9413 +               set_devinfo(toi_file_target_bdev, ffs(target_blocksize));
9414 +
9415 +       result = __test_toi_file_target(commandline, 1, 0);
9416 +
9417 +out:
9418 +       if (result)
9419 +               clear_toi_state(TOI_CAN_HIBERNATE);
9420 +
9421 +       if (!quiet)
9422 +               printk(KERN_INFO "Resuming %sabled.\n", result ? "dis" : "en");
9423 +
9424 +       if (colon)
9425 +               *colon = ':';
9426 +       if (at_symbol)
9427 +               *at_symbol = '@';
9428 +
9429 +       return result;
9430 +}
9431 +
9432 +/* toi_file_save_config_info
9433 + *
9434 + * Description:        Save the target's name, not for resume time, but for
9435 + *             all_settings.
9436 + * Arguments:  Buffer:         Pointer to a buffer of size PAGE_SIZE.
9437 + * Returns:    Number of bytes used for saving our data.
9438 + */
9439 +
9440 +static int toi_file_save_config_info(char *buffer)
9441 +{
9442 +       strcpy(buffer, toi_file_target);
9443 +       return strlen(toi_file_target) + 1;
9444 +}
9445 +
9446 +/* toi_file_load_config_info
9447 + *
9448 + * Description:        Reload target's name.
9449 + * Arguments:  Buffer:         Pointer to the start of the data.
9450 + *             Size:           Number of bytes that were saved.
9451 + */
9452 +
9453 +static void toi_file_load_config_info(char *buffer, int size)
9454 +{
9455 +       strcpy(toi_file_target, buffer);
9456 +}
9457 +
9458 +static int toi_file_initialise(int starting_cycle)
9459 +{
9460 +       if (starting_cycle) {
9461 +               if (toiActiveAllocator != &toi_fileops)
9462 +                       return 0;
9463 +
9464 +               if (starting_cycle & SYSFS_HIBERNATE && !*toi_file_target) {
9465 +                       printk(KERN_INFO "FileAllocator is the active writer, "
9466 +                                       "but no filename has been set.\n");
9467 +                       return 1;
9468 +               }
9469 +       }
9470 +
9471 +       if (*toi_file_target)
9472 +               toi_file_get_target_info(toi_file_target, starting_cycle, 0);
9473 +
9474 +       if (starting_cycle && (toi_file_image_exists(1) == -1)) {
9475 +               printk("%s does not have a valid signature for "
9476 +                               "hibernating.\n", toi_file_target);
9477 +               return 1;
9478 +       }
9479 +
9480 +       return 0;
9481 +}
9482 +
9483 +static struct toi_sysfs_data sysfs_params[] = {
9484 +
9485 +       {
9486 +        TOI_ATTR("target", SYSFS_RW),
9487 +        SYSFS_STRING(toi_file_target, 256, SYSFS_NEEDS_SM_FOR_WRITE),
9488 +        .write_side_effect             = test_toi_file_target,
9489 +       },
9490 +
9491 +       {
9492 +         TOI_ATTR("enabled", SYSFS_RW),
9493 +         SYSFS_INT(&toi_fileops.enabled, 0, 1, 0),
9494 +         .write_side_effect            = attempt_to_parse_resume_device2,
9495 +       }
9496 +};
9497 +
9498 +static struct toi_module_ops toi_fileops = {
9499 +       .type                                   = WRITER_MODULE,
9500 +       .name                                   = "file storage",
9501 +       .directory                              = "file",
9502 +       .module                                 = THIS_MODULE,
9503 +       .print_debug_info                       = toi_file_print_debug_stats,
9504 +       .save_config_info                       = toi_file_save_config_info,
9505 +       .load_config_info                       = toi_file_load_config_info,
9506 +       .storage_needed                         = toi_file_storage_needed,
9507 +       .initialise                             = toi_file_initialise,
9508 +       .cleanup                                = toi_file_cleanup,
9509 +
9510 +       .noresume_reset         = toi_file_noresume_reset,
9511 +       .storage_available      = toi_file_storage_available,
9512 +       .storage_allocated      = toi_file_storage_allocated,
9513 +       .release_storage        = toi_file_release_storage,
9514 +       .reserve_header_space   = toi_file_reserve_header_space,
9515 +       .allocate_storage       = toi_file_allocate_storage,
9516 +       .image_exists           = toi_file_image_exists,
9517 +       .mark_resume_attempted  = toi_file_mark_resume_attempted,
9518 +       .write_header_init      = toi_file_write_header_init,
9519 +       .write_header_cleanup   = toi_file_write_header_cleanup,
9520 +       .read_header_init       = toi_file_read_header_init,
9521 +       .read_header_cleanup    = toi_file_read_header_cleanup,
9522 +       .remove_image           = toi_file_remove_image,
9523 +       .parse_sig_location     = toi_file_parse_sig_location,
9524 +
9525 +       .sysfs_data             = sysfs_params,
9526 +       .num_sysfs_entries      = sizeof(sysfs_params) /
9527 +               sizeof(struct toi_sysfs_data),
9528 +};
9529 +
9530 +/* ---- Registration ---- */
9531 +static __init int toi_file_load(void)
9532 +{
9533 +       toi_fileops.rw_init = toi_bio_ops.rw_init;
9534 +       toi_fileops.rw_cleanup = toi_bio_ops.rw_cleanup;
9535 +       toi_fileops.read_page = toi_bio_ops.read_page;
9536 +       toi_fileops.write_page = toi_bio_ops.write_page;
9537 +       toi_fileops.rw_header_chunk = toi_bio_ops.rw_header_chunk;
9538 +       toi_fileops.rw_header_chunk_noreadahead =
9539 +               toi_bio_ops.rw_header_chunk_noreadahead;
9540 +       toi_fileops.io_flusher = toi_bio_ops.io_flusher;
9541 +
9542 +       return toi_register_module(&toi_fileops);
9543 +}
9544 +
9545 +#ifdef MODULE
9546 +static __exit void toi_file_unload(void)
9547 +{
9548 +       toi_unregister_module(&toi_fileops);
9549 +}
9550 +
9551 +module_init(toi_file_load);
9552 +module_exit(toi_file_unload);
9553 +MODULE_LICENSE("GPL");
9554 +MODULE_AUTHOR("Nigel Cunningham");
9555 +MODULE_DESCRIPTION("TuxOnIce FileAllocator");
9556 +#else
9557 +late_initcall(toi_file_load);
9558 +#endif
9559 diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
9560 new file mode 100644
9561 index 0000000..f233f24
9562 --- /dev/null
9563 +++ b/kernel/power/tuxonice_highlevel.c
9564 @@ -0,0 +1,1328 @@
9565 +/*
9566 + * kernel/power/tuxonice_highlevel.c
9567 + */
9568 +/** \mainpage TuxOnIce.
9569 + *
9570 + * TuxOnIce provides support for saving and restoring an image of
9571 + * system memory to an arbitrary storage device, either on the local computer,
9572 + * or across some network. The support is entirely OS based, so TuxOnIce
9573 + * works without requiring BIOS, APM or ACPI support. The vast majority of the
9574 + * code is also architecture independent, so it should be very easy to port
9575 + * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
9576 + * and preemption. Initramfses and initrds are also supported.
9577 + *
9578 + * TuxOnIce uses a modular design, in which the method of storing the image is
9579 + * completely abstracted from the core code, as are transformations on the data
9580 + * such as compression and/or encryption (multiple 'modules' can be used to
9581 + * provide arbitrary combinations of functionality). The user interface is also
9582 + * modular, so that arbitrarily simple or complex interfaces can be used to
9583 + * provide anything from debugging information through to eye candy.
9584 + *
9585 + * \section Copyright
9586 + *
9587 + * TuxOnIce is released under the GPLv2.
9588 + *
9589 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
9590 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
9591 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
9592 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)<BR>
9593 + *
9594 + * \section Credits
9595 + *
9596 + * Nigel would like to thank the following people for their work:
9597 + *
9598 + * Bernard Blackham <bernard@blackham.com.au><BR>
9599 + * Web page & Wiki administration, some coding. A person without whom
9600 + * TuxOnIce would not be where it is.
9601 + *
9602 + * Michael Frank <mhf@linuxmail.org><BR>
9603 + * Extensive testing and help with improving stability. I was constantly
9604 + * amazed by the quality and quantity of Michael's help.
9605 + *
9606 + * Pavel Machek <pavel@ucw.cz><BR>
9607 + * Modifications, defectiveness pointing, being with Gabor at the very
9608 + * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
9609 + * 2.5.17. Even though Pavel and I disagree on the direction suspend to
9610 + * disk should take, I appreciate the valuable work he did in helping Gabor
9611 + * get the concept working.
9612 + *
9613 + * ..and of course the myriads of TuxOnIce users who have helped diagnose
9614 + * and fix bugs, made suggestions on how to improve the code, proofread
9615 + * documentation, and donated time and money.
9616 + *
9617 + * Thanks also to corporate sponsors:
9618 + *
9619 + * <B>Redhat.</B> Sometime employer from May 2006 (my fault, not Redhat's!).
9620 + *
9621 + * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
9622 + * allowed him to work on TuxOnIce and PM related issues on company time.
9623 + *
9624 + * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
9625 + * 2003 to Jan 2004.
9626 + *
9627 + * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
9628 + * maintenance of SMP and Highmem support.
9629 + *
9630 + * <B>OSDL.</B> Provided access to various hardware configurations, and made
9631 + * occasional small donations to the project.
9632 + */
9633 +
9634 +#include <linux/suspend.h>
9635 +#include <linux/module.h>
9636 +#include <linux/freezer.h>
9637 +#include <linux/utsrelease.h>
9638 +#include <linux/cpu.h>
9639 +#include <linux/console.h>
9640 +#include <linux/writeback.h>
9641 +#include <asm/uaccess.h>
9642 +
9643 +#include "tuxonice_modules.h"
9644 +#include "tuxonice_sysfs.h"
9645 +#include "tuxonice_prepare_image.h"
9646 +#include "tuxonice_io.h"
9647 +#include "tuxonice_ui.h"
9648 +#include "tuxonice_power_off.h"
9649 +#include "tuxonice_storage.h"
9650 +#include "tuxonice_checksum.h"
9651 +#include "tuxonice_cluster.h"
9652 +#include "tuxonice_builtin.h"
9653 +#include "tuxonice_atomic_copy.h"
9654 +#include "tuxonice_alloc.h"
9655 +
9656 +/*! Pageset metadata. */
9657 +struct pagedir pagedir2 = {2};
9658 +
9659 +static int get_pmsem = 0, got_pmsem;
9660 +static mm_segment_t oldfs;
9661 +static DEFINE_MUTEX(tuxonice_in_use);
9662 +static int block_dump_save;
9663 +static char pre_hibernate_command[256];
9664 +static char post_hibernate_command[256];
9665 +
9666 +char *tuxonice_signature = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
9667 +
9668 +int toi_fail_num;
9669 +
9670 +int do_toi_step(int step);
9671 +
9672 +unsigned long boot_kernel_data_buffer;
9673 +
9674 +/**
9675 + * toi_finish_anything - Cleanup after doing anything.
9676 + *
9677 + * @hibernate_or_resume: Whether finishing a cycle or an attempt at resuming.
9678 + *
9679 + * This is our basic clean-up routine, matching start_anything below. We
9680 + * call cleanup routines, drop module references and restore process fs and
9681 + * cpus allowed masks, together with the global block_dump variable's value.
9682 + */
9683 +void toi_finish_anything(int hibernate_or_resume)
9684 +{
9685 +       toi_cleanup_modules(hibernate_or_resume);
9686 +       toi_put_modules();
9687 +       if (hibernate_or_resume) {
9688 +               block_dump = block_dump_save;
9689 +               set_cpus_allowed(current, CPU_MASK_ALL);
9690 +               toi_alloc_print_debug_stats();
9691 +
9692 +               if (hibernate_or_resume == SYSFS_HIBERNATE &&
9693 +                               strlen(post_hibernate_command))
9694 +                       toi_launch_userspace_program(post_hibernate_command,
9695 +                                       0, UMH_WAIT_PROC);
9696 +       }
9697 +
9698 +       set_fs(oldfs);
9699 +       mutex_unlock(&tuxonice_in_use);
9700 +}
9701 +
9702 +/**
9703 + * toi_start_anything - Basic initialisation for TuxOnIce.
9704 + *
9705 + * @hibernate_or_resume: Whether starting a cycle or an attempt at resuming.
9706 + *
9707 + * Our basic initialisation routine. Take references on modules, use the
9708 + * kernel segment, recheck resume= if no active allocator is set, initialise
9709 + * modules, save and reset block_dump and ensure we're running on CPU0.
9710 + */
9711 +int toi_start_anything(int hibernate_or_resume)
9712 +{
9713 +       mutex_lock(&tuxonice_in_use);
9714 +
9715 +       oldfs = get_fs();
9716 +       set_fs(KERNEL_DS);
9717 +
9718 +       if (hibernate_or_resume == SYSFS_HIBERNATE &&
9719 +                       strlen(pre_hibernate_command)) {
9720 +               int result = toi_launch_userspace_program(pre_hibernate_command,
9721 +                               0, UMH_WAIT_PROC);
9722 +               if (result) {
9723 +                       printk(KERN_INFO "Pre-hibernate command '%s' returned "
9724 +                                       "%d. Aborting.\n",
9725 +                                       pre_hibernate_command, result);
9726 +                       goto prehibernate_err;
9727 +               }
9728 +       }
9729 +
9730 +       if (hibernate_or_resume == SYSFS_HIBERNATE)
9731 +               toi_print_modules();
9732 +
9733 +       if (toi_get_modules()) {
9734 +               printk("TuxOnIce: Get modules failed!\n");
9735 +               goto getmodules_err;
9736 +       }
9737 +
9738 +       if (hibernate_or_resume) {
9739 +               block_dump_save = block_dump;
9740 +               block_dump = 0;
9741 +               set_cpus_allowed(current,
9742 +                               cpumask_of_cpu(first_cpu(cpu_online_map)));
9743 +       }
9744 +
9745 +       if (toi_initialise_modules_early(hibernate_or_resume))
9746 +               goto early_init_err;
9747 +
9748 +       if (!toiActiveAllocator)
9749 +               toi_attempt_to_parse_resume_device(!hibernate_or_resume);
9750 +
9751 +       if (toi_initialise_modules_late(hibernate_or_resume))
9752 +               goto late_init_err;
9753 +
9754 +       return 0;
9755 +
9756 +late_init_err:
9757 +       toi_cleanup_modules(hibernate_or_resume);
9758 +early_init_err:
9759 +       if (hibernate_or_resume) {
9760 +               block_dump = block_dump_save;
9761 +               set_cpus_allowed(current, CPU_MASK_ALL);
9762 +       }
9763 +getmodules_err:
9764 +prehibernate_err:
9765 +       set_fs(oldfs);
9766 +       mutex_unlock(&tuxonice_in_use);
9767 +       return -EBUSY;
9768 +}
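+
+/*
+ * Sketch of the intended pairing of the two routines above
+ * (hypothetical caller):
+ *
+ *     if (toi_start_anything(SYSFS_HIBERNATE))
+ *             return -EBUSY;
+ *
+ *     ... do the work of the cycle ...
+ *
+ *     toi_finish_anything(SYSFS_HIBERNATE);
+ */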
9769 +
9770 +/*
9771 + * Nosave page tracking.
9772 + *
9773 + * Here rather than in prepare_image because we want to do it once only at the
9774 + * start of a cycle.
9775 + */
9776 +
9777 +/**
9778 + * mark_nosave_pages - Set up our Nosave bitmap.
9779 + *
9780 + * Build a bitmap of Nosave pages from the list. The bitmap allows faster
9781 + * use when preparing the image.
9782 + */
9783 +static void mark_nosave_pages(void)
9784 +{
9785 +       struct nosave_region *region;
9786 +
9787 +       list_for_each_entry(region, &nosave_regions, list) {
9788 +               unsigned long pfn;
9789 +
9790 +               for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
9791 +                       SetPageNosave(pfn_to_page(pfn));
9792 +       }
9793 +}
9794 +
9795 +/**
9796 + * allocate_bitmaps: Allocate bitmaps used to record page states.
9797 + *
9798 + * Allocate the bitmaps we use to record the various TuxOnIce related
9799 + * page states.
9800 + */
9801 +static int allocate_bitmaps(void)
9802 +{
9803 +       if (allocate_dyn_pageflags(&pageset1_map, 0) ||
9804 +           allocate_dyn_pageflags(&pageset1_copy_map, 0) ||
9805 +           allocate_dyn_pageflags(&pageset2_map, 0) ||
9806 +           allocate_dyn_pageflags(&io_map, 0) ||
9807 +           allocate_dyn_pageflags(&nosave_map, 0) ||
9808 +           allocate_dyn_pageflags(&free_map, 0) ||
9809 +           allocate_dyn_pageflags(&page_resave_map, 0))
9810 +               return 1;
9811 +
9812 +       return 0;
9813 +}
9814 +
9815 +/**
9816 + * free_bitmaps: Free the bitmaps used to record page states.
9817 + *
9818 + * Free the bitmaps allocated above. It is not an error to call
9819 + * free_dyn_pageflags on a bitmap that isn't currently allocated.
9820 + */
9821 +static void free_bitmaps(void)
9822 +{
9823 +       free_dyn_pageflags(&pageset1_map);
9824 +       free_dyn_pageflags(&pageset1_copy_map);
9825 +       free_dyn_pageflags(&pageset2_map);
9826 +       free_dyn_pageflags(&io_map);
9827 +       free_dyn_pageflags(&nosave_map);
9828 +       free_dyn_pageflags(&free_map);
9829 +       free_dyn_pageflags(&page_resave_map);
9830 +}
9831 +
9832 +/**
9833 + * io_MB_per_second: Return the number of MB/s read or written.
9834 + *
9835 + * @write: Whether to return the speed at which we wrote.
9836 + *
9837 + * Calculate the number of megabytes per second that were read or written.
9838 + */
9839 +static int io_MB_per_second(int write)
9840 +{
9841 +       return (toi_bkd.toi_io_time[write][1]) ?
9842 +               MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
9843 +               toi_bkd.toi_io_time[write][1] : 0;
9844 +}
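+
+/*
+ * Example with made-up figures: if toi_io_time[write][0] corresponds to
+ * 200 MB and toi_io_time[write][1] is 500 jiffies with HZ == 250 (two
+ * seconds of I/O), the result is 200 * 250 / 500 = 100 MB/s.
+ */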
9845 +
9846 +/**
9847 + * get_debug_info: Fill a buffer with debugging information.
9848 + *
9849 + * @buffer: The buffer to be filled.
9850 + * @count: The size of the buffer, in bytes.
9851 + *
9852 + * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
9853 + * Fill a (usually PAGE_SIZE-sized) buffer with the debugging info that we will
9854 + */
9855 +#define SNPRINTF(a...)         do { len += snprintf_used(((char *) buffer) + len, \
9856 +               count - len - 1, ## a); } while (0)
9857 +
9858 +static int get_toi_debug_info(const char *buffer, int count)
9859 +{
9860 +       int len = 0;
9861 +
9862 +       SNPRINTF("TuxOnIce debugging info:\n");
9863 +       SNPRINTF("- TuxOnIce core  : " TOI_CORE_VERSION "\n");
9864 +       SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
9865 +       SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
9866 +       SNPRINTF("- Attempt number : %d\n", nr_hibernates);
9867 +       SNPRINTF("- Parameters     : %ld %ld %ld %d %d %ld\n",
9868 +                       toi_result,
9869 +                       toi_bkd.toi_action,
9870 +                       toi_bkd.toi_debug_state,
9871 +                       toi_bkd.toi_default_console_level,
9872 +                       image_size_limit,
9873 +                       toi_poweroff_method);
9874 +       SNPRINTF("- Overall expected compression percentage: %d.\n",
9875 +                       100 - toi_expected_compression_ratio());
9876 +       len += toi_print_module_debug_info(((char *) buffer) + len,
9877 +                       count - len - 1);
9878 +       if (toi_bkd.toi_io_time[0][1]) {
9879 +               if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
9880 +                       SNPRINTF("- I/O speed: Write %d KB/s",
9881 +                         (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
9882 +                         toi_bkd.toi_io_time[0][1]));
9883 +                       if (toi_bkd.toi_io_time[1][1])
9884 +                               SNPRINTF(", Read %d KB/s",
9885 +                                 (KB((unsigned long)
9886 +                                     toi_bkd.toi_io_time[1][0]) * HZ /
9887 +                                 toi_bkd.toi_io_time[1][1]));
9888 +               } else {
9889 +                       SNPRINTF("- I/O speed: Write %d MB/s",
9890 +                        (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
9891 +                         toi_bkd.toi_io_time[0][1]));
9892 +                       if (toi_bkd.toi_io_time[1][1])
9893 +                               SNPRINTF(", Read %d MB/s",
9894 +                                (MB((unsigned long)
9895 +                                    toi_bkd.toi_io_time[1][0]) * HZ /
9896 +                                 toi_bkd.toi_io_time[1][1]));
9897 +               }
9898 +               SNPRINTF(".\n");
9899 +       } else
9900 +               SNPRINTF("- No I/O speed stats available.\n");
9901 +       SNPRINTF("- Extra pages    : %ld used/%ld.\n",
9902 +                       extra_pd1_pages_used, extra_pd1_pages_allowance);
9903 +
9904 +       return len;
9905 +}
9906 +
9907 +/**
9908 + * do_cleanup: Cleanup after attempting to hibernate or resume.
9909 + *
9910 + * @get_debug_info: Whether to allocate and return debugging info.
9911 + *
9912 + * Cleanup after attempting to hibernate or resume, possibly getting
9913 + * debugging info as we do so.
9914 + */
9915 +static void do_cleanup(int get_debug_info)
9916 +{
9917 +       int i = 0;
9918 +       char *buffer = NULL;
9919 +
9920 +       if (get_debug_info)
9921 +               toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
9922 +       relink_lru_lists();
9923 +
9924 +       free_checksum_pages();
9925 +
9926 +       if (get_debug_info)
9927 +               buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
9928 +
9929 +       if (buffer)
9930 +               i = get_toi_debug_info(buffer, PAGE_SIZE);
9931 +
9932 +       toi_free_extra_pagedir_memory();
9933 +
9934 +       pagedir1.size = pagedir2.size = 0;
9935 +       set_highmem_size(pagedir1, 0);
9936 +       set_highmem_size(pagedir2, 0);
9937 +
9938 +       if (boot_kernel_data_buffer) {
9939 +               toi_free_page(37, boot_kernel_data_buffer);
9940 +               boot_kernel_data_buffer = 0;
9941 +       }
9942 +
9943 +       if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
9944 +               pm_notifier_call_chain(PM_POST_HIBERNATION);
9945 +               clear_toi_state(TOI_NOTIFIERS_PREPARE);
9946 +       }
9947 +
9948 +       thaw_processes();
9949 +
9950 +#ifdef CONFIG_TOI_KEEP_IMAGE
9951 +       if (test_action_state(TOI_KEEP_IMAGE) &&
9952 +           !test_result_state(TOI_ABORTED)) {
9953 +               toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
9954 +                       "TuxOnIce: Not invalidating the image due "
9955 +                       "to Keep Image being enabled.\n");
9956 +               set_result_state(TOI_KEPT_IMAGE);
9957 +       } else
9958 +#endif
9959 +               if (toiActiveAllocator)
9960 +                       toiActiveAllocator->remove_image();
9961 +
9962 +       free_bitmaps();
9963 +
9964 +       if (buffer && i) {
9965 +               /* Printk can only handle 1023 bytes, including
9966 +                * its level mangling. */
9967 +               for (i = 0; i < 3; i++)
9968 +                       printk("%s", buffer + (1023 * i));
9969 +               toi_free_page(20, (unsigned long) buffer);
9970 +       }
9971 +
9972 +       if (!test_action_state(TOI_LATE_CPU_HOTPLUG))
9973 +               enable_nonboot_cpus();
9974 +       toi_cleanup_console();
9975 +
9976 +       free_attention_list();
9977 +
9978 +       toi_deactivate_storage(0);
9979 +
9980 +       clear_toi_state(TOI_IGNORE_LOGLEVEL);
9981 +       clear_toi_state(TOI_TRYING_TO_RESUME);
9982 +       clear_toi_state(TOI_NOW_RESUMING);
9983 +
9984 +       if (got_pmsem) {
9985 +               mutex_unlock(&pm_mutex);
9986 +               got_pmsem = 0;
9987 +       }
9988 +}
9989 +
9990 +/**
9991 + * check_still_keeping_image: We kept an image; check whether to reuse it.
9992 + *
9993 + * We enter this routine when we have kept an image. If the user has said they
9994 + * still want to keep it, all we need to do is power down. If powering down
9995 + * means hibernating to ram and the power doesn't run out, we'll return 1.
9996 + * If we do power off properly or the battery runs out, we'll resume via the
9997 + * normal paths.
9998 + *
9999 + * If the user has said they want to remove the previously kept image, we
10000 + * remove it, and return 0. We'll then store a new image.
10001 + */
10002 +static int check_still_keeping_image(void)
10003 +{
10004 +       if (test_action_state(TOI_KEEP_IMAGE)) {
10005 +               printk("Image already stored: powering down immediately.\n");
10006 +               do_toi_step(STEP_HIBERNATE_POWERDOWN);
10007 +               return 1;       /* Just in case we're using S3 */
10008 +       }
10009 +
10010 +       printk("Invalidating previous image.\n");
10011 +       toiActiveAllocator->remove_image();
10012 +
10013 +       return 0;
10014 +}
10015 +
10016 +/**
10017 + * toi_init: Prepare to hibernate to disk.
10018 + *
10019 + * Initialise variables & data structures, in preparation for
10020 + * hibernating to disk. Returns 1 on success, 0 on failure.
10021 + */
10022 +static int toi_init(void)
10023 +{
10024 +       int result, i, j;
10025 +
10026 +       toi_result = 0;
10027 +
10028 +       printk(KERN_INFO "Initiating a hibernation cycle.\n");
10029 +
10030 +       nr_hibernates++;
10031 +
10032 +       for (i = 0; i < 2; i++)
10033 +               for (j = 0; j < 2; j++)
10034 +                       toi_bkd.toi_io_time[i][j] = 0;
10035 +
10036 +       if (!test_toi_state(TOI_CAN_HIBERNATE) ||
10037 +           allocate_bitmaps())
10038 +               return 0;
10039 +
10040 +       mark_nosave_pages();
10041 +
10042 +       toi_prepare_console();
10043 +
10044 +       result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
10045 +       if (result) {
10046 +               set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
10047 +               return 0;
10048 +       }
10049 +       set_toi_state(TOI_NOTIFIERS_PREPARE);
10050 +
10051 +       boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
10052 +       if (!boot_kernel_data_buffer) {
10053 +               printk(KERN_ERR "TuxOnIce: Failed to allocate "
10054 +                               "boot_kernel_data_buffer.\n");
10055 +               set_result_state(TOI_OUT_OF_MEMORY);
10056 +               return 0;
10057 +       }
10058 +
10059 +       if (test_action_state(TOI_LATE_CPU_HOTPLUG) ||
10060 +                       !disable_nonboot_cpus())
10061 +               return 1;
10062 +
10063 +       set_abort_result(TOI_CPU_HOTPLUG_FAILED);
10064 +       return 0;
10065 +}
10066 +
10067 +/**
10068 + * can_hibernate: Perform basic 'Can we hibernate?' tests.
10069 + *
10070 + * Perform basic tests that must pass if we're going to be able to hibernate:
10071 + * Can we get the pm_mutex? Is resume= valid (we need to know where to write
10072 + * the image header).
10073 + */
10074 +static int can_hibernate(void)
10075 +{
10076 +       if (get_pmsem) {
10077 +               if (!mutex_trylock(&pm_mutex)) {
10078 +                       printk(KERN_INFO "TuxOnIce: Failed to obtain "
10079 +                                       "pm_mutex.\n");
10080 +                       dump_stack();
10081 +                       set_abort_result(TOI_PM_SEM);
10082 +                       return 0;
10083 +               }
10084 +               got_pmsem = 1;
10085 +       }
10086 +
10087 +       if (!test_toi_state(TOI_CAN_HIBERNATE))
10088 +               toi_attempt_to_parse_resume_device(0);
10089 +
10090 +       if (!test_toi_state(TOI_CAN_HIBERNATE)) {
10091 +               printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
10092 +                       "This may be because you haven't put something along "
10093 +                       "the lines of\n\nresume=swap:/dev/hda1\n\n"
10094 +                       "in lilo.conf or equivalent. (Where /dev/hda1 is your "
10095 +                       "swap partition).\n");
10096 +               set_abort_result(TOI_CANT_SUSPEND);
10097 +               if (got_pmsem) {
10098 +                       mutex_unlock(&pm_mutex);
10099 +                       got_pmsem = 0;
10100 +               }
10101 +               return 0;
10102 +       }
10103 +
10104 +       return 1;
10105 +}
10106 +
10107 +/**
10108 + * do_post_image_write: Having written an image, figure out what to do next.
10109 + *
10110 + * After writing an image, we might load an alternate image or power down.
10111 + * Powering down might involve hibernating to ram, in which case we also
10112 + * need to handle reloading pageset2.
10113 + */
10114 +static int do_post_image_write(void)
10115 +{
10116 +       /* If switching images fails, do normal powerdown */
10117 +       if (alt_resume_param[0])
10118 +               do_toi_step(STEP_RESUME_ALT_IMAGE);
10119 +
10120 +       toi_power_down();
10121 +
10122 +       /* If we return, it's because we hibernated to ram */
10123 +       if (read_pageset2(1))
10124 +               panic("Attempt to reload pagedir 2 failed. Try rebooting.");
10125 +
10126 +       barrier();
10127 +       mb();
10128 +       do_cleanup(1);
10129 +       return 0;
10130 +}
10131 +
10132 +/**
10133 + * __save_image: Do the hard work of saving the image.
10134 + *
10135 + * High level routine for getting the image saved. The key assumptions made
10136 + * are that processes have been frozen and sufficient memory is available.
10137 + *
10138 + * We also exit through here at resume time, coming back from toi_hibernate
10139 + * after the atomic restore. This is the reason for the toi_in_hibernate
10140 + * test.
10141 + */
10142 +static int __save_image(void)
10143 +{
10144 +       int temp_result, did_copy = 0;
10145 +
10146 +       toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image..");
10147 +
10148 +       toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
10149 +               " - Final values: %d and %d.\n",
10150 +               pagedir1.size, pagedir2.size);
10151 +
10152 +       toi_cond_pause(1, "About to write pagedir2.");
10153 +
10154 +       temp_result = write_pageset(&pagedir2);
10155 +
10156 +       if (temp_result == -1 || test_result_state(TOI_ABORTED))
10157 +               return 1;
10158 +
10159 +       toi_cond_pause(1, "About to copy pageset 1.");
10160 +
10161 +       if (test_result_state(TOI_ABORTED))
10162 +               return 1;
10163 +
10164 +       toi_deactivate_storage(1);
10165 +
10166 +       toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy.");
10167 +
10168 +       toi_in_hibernate = 1;
10169 +
10170 +       if (toi_go_atomic(PMSG_FREEZE, 1))
10171 +               goto Failed;
10172 +
10173 +       temp_result = toi_hibernate();
10174 +       if (!temp_result)
10175 +               did_copy = 1;
10176 +
10177 +       /* We return here at resume time too! */
10178 +       toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result);
10179 +
10180 +Failed:
10181 +       if (toi_activate_storage(1))
10182 +               panic("Failed to reactivate our storage.");
10183 +
10184 +       /* Resume time? */
10185 +       if (!toi_in_hibernate) {
10186 +               copyback_post();
10187 +               return 0;
10188 +       }
10189 +
10190 +       /* Nope. Hibernating. So, see if we can save the image... */
10191 +
10192 +       if (temp_result || test_result_state(TOI_ABORTED)) {
10193 +               if (did_copy)
10194 +                       goto abort_reloading_pagedir_two;
10195 +               else
10196 +                       return 1;
10197 +       }
10198 +
10199 +       toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size,
10200 +                       NULL);
10201 +
10202 +       if (test_result_state(TOI_ABORTED))
10203 +               goto abort_reloading_pagedir_two;
10204 +
10205 +       toi_cond_pause(1, "About to write pageset1.");
10206 +
10207 +       toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1\n");
10208 +
10209 +       temp_result = write_pageset(&pagedir1);
10210 +
10211 +       /* We didn't overwrite any memory, so no reread needs to be done. */
10212 +       if (test_action_state(TOI_TEST_FILTER_SPEED))
10213 +               return 1;
10214 +
10215 +       if (temp_result == 1 || test_result_state(TOI_ABORTED))
10216 +               goto abort_reloading_pagedir_two;
10217 +
10218 +       toi_cond_pause(1, "About to write header.");
10219 +
10220 +       if (test_result_state(TOI_ABORTED))
10221 +               goto abort_reloading_pagedir_two;
10222 +
10223 +       temp_result = write_image_header();
10224 +
10225 +       if (test_action_state(TOI_TEST_BIO))
10226 +               return 1;
10227 +
10228 +       if (!temp_result && !test_result_state(TOI_ABORTED))
10229 +               return 0;
10230 +
10231 +abort_reloading_pagedir_two:
10232 +       temp_result = read_pageset2(1);
10233 +
10234 +       /* If that failed, we're sunk. Panic! */
10235 +       if (temp_result)
10236 +               panic("Attempt to reload pagedir 2 while aborting "
10237 +                               "a hibernate failed.");
10238 +
10239 +       return 1;
10240 +}
10241 +
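/*
 * [Editorial aside, not part of the patch] The "we return here at resume
 * time too" behaviour of toi_hibernate()/__save_image() above is easiest to
 * understand by analogy with setjmp()/longjmp(): the save-side call returns
 * a second time on the restore path. A minimal, runnable userspace sketch:
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf saved_state;

int main(void)
{
        if (!setjmp(saved_state)) {     /* first return: state just "saved" */
                printf("State saved; pretending to power down.\n");
                longjmp(saved_state, 1); /* stands in for the atomic restore */
        }
        /* Second return: the resume-time path, like !toi_in_hibernate above. */
        printf("Back again after 'restore'.\n");
        return 0;
}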
10242 +/**
10243 + * do_save_image: Save the image and handle the result.
10244 + *
10245 + * Save the prepared image. If we fail or we're in the path returning
10246 + * from the atomic restore, clean up.
10247 + */
10248 +
10249 +static int do_save_image(void)
10250 +{
10251 +       int result = __save_image();
10252 +       if (!toi_in_hibernate || result)
10253 +               do_cleanup(1);
10254 +       return result;
10255 +}
10256 +
10257 +
10258 +/**
10259 + * do_prepare_image: Try to prepare an image.
10260 + *
10261 + * Try to initialise and prepare an image to be saved. On failure,
10262 + * clean up.
10263 + */
10264 +
10265 +static int do_prepare_image(void)
10266 +{
10267 +       if (toi_activate_storage(0))
10268 +               return 1;
10269 +
10270 +       /*
10271 +        * If we kept an image and are still keeping it and hibernating to RAM,
10272 +        * we will return 1 after hibernating and resuming (provided the power
10273 +        * doesn't run out). In that case, we skip directly to cleanup and exit.
10274 +        */
10275 +
10276 +       if (!can_hibernate() ||
10277 +           (test_result_state(TOI_KEPT_IMAGE) &&
10278 +            check_still_keeping_image()))
10279 +               goto cleanup;
10280 +
10281 +       if (toi_init() && !toi_prepare_image() &&
10282 +                       !test_result_state(TOI_ABORTED))
10283 +               return 0;
10284 +
10285 +cleanup:
10286 +       do_cleanup(0);
10287 +       return 1;
10288 +}
10289 +
10290 +/**
10291 + * do_check_can_resume: Find out whether an image has been stored.
10292 + *
10293 + * Read whether an image exists. We use the same routine as the
10294 + * image_exists sysfs entry, and just look to see whether the
10295 + * first character in the resulting buffer is a '1'.
10296 + */
10297 +int do_check_can_resume(void)
10298 +{
10299 +       char *buf = (char *) toi_get_zeroed_page(21, TOI_ATOMIC_GFP);
10300 +       int result = 0;
10301 +
10302 +       if (!buf)
10303 +               return 0;
10304 +
10305 +       /* Only interested in first byte, so throw away return code. */
10306 +       image_exists_read(buf, PAGE_SIZE);
10307 +
10308 +       if (buf[0] == '1')
10309 +               result = 1;
10310 +
10311 +       toi_free_page(21, (unsigned long) buf);
10312 +       return result;
10313 +}
10314 +
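/*
 * [Editorial aside, not part of the patch] Userspace can apply the same
 * first-byte test through the image_exists sysfs entry registered later in
 * this file (the /sys/power/tuxonice directory is named in the docstrings
 * below). A minimal, runnable sketch:
 */
#include <stdio.h>

int main(void)
{
        char c = '0';
        FILE *f = fopen("/sys/power/tuxonice/image_exists", "r");

        if (f) {
                if (fread(&c, 1, 1, f) != 1)
                        c = '0';
                fclose(f);
        }
        printf(c == '1' ? "Image exists.\n" : "No image.\n");
        return 0;
}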
10315 +/**
10316 + * do_load_atomic_copy: Load the first part of an image, if it exists.
10317 + *
10318 + * Check whether we have an image. If one exists, do sanity checking
10319 + * (possibly invalidating the image or even rebooting if the user
10320 + * requests that) before loading it into memory in preparation for the
10321 + * atomic restore.
10322 + *
10323 + * If and only if we have an image loaded and ready to restore, we return 1.
10324 + */
10325 +static int do_load_atomic_copy(void)
10326 +{
10327 +       int read_image_result = 0;
10328 +
10329 +       if (sizeof(swp_entry_t) != sizeof(long)) {
10330 +               printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
10331 +                       " of long. Please report this!\n");
10332 +               return 1;
10333 +       }
10334 +
10335 +       if (!resume_file[0])
10336 +               printk(KERN_WARNING "TuxOnIce: "
10337 +                       "You need to use a resume= command line parameter to "
10338 +                       "tell TuxOnIce where to look for an image.\n");
10339 +
10340 +       toi_activate_storage(0);
10341 +
10342 +       if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
10343 +               !toi_attempt_to_parse_resume_device(0)) {
10344 +               /*
10345 +                * Without a usable storage device we can do nothing -
10346 +                * even if noresume is given
10347 +                */
10348 +
10349 +               if (!toiNumAllocators)
10350 +                       printk(KERN_ALERT "TuxOnIce: "
10351 +                         "No storage allocators have been registered.\n");
10352 +               else
10353 +                       printk(KERN_ALERT "TuxOnIce: "
10354 +                               "Missing or invalid storage location "
10355 +                               "(resume= parameter). Please correct and "
10356 +                               "rerun lilo (or equivalent) before "
10357 +                               "hibernating.\n");
10358 +               toi_deactivate_storage(0);
10359 +               return 1;
10360 +       }
10361 +
10362 +       read_image_result = read_pageset1(); /* non fatal error ignored */
10363 +
10364 +       if (test_toi_state(TOI_NORESUME_SPECIFIED))
10365 +               clear_toi_state(TOI_NORESUME_SPECIFIED);
10366 +
10367 +       toi_deactivate_storage(0);
10368 +
10369 +       if (read_image_result)
10370 +               return 1;
10371 +
10372 +       return 0;
10373 +}
10374 +
10375 +/**
10376 + * prepare_restore_load_alt_image: Save & restore alt image variables.
10377 + *
10378 + * Save and restore the pageset1 maps, when loading an alternate image.
10379 + */
10380 +static void prepare_restore_load_alt_image(int prepare)
10381 +{
10382 +       static struct dyn_pageflags pageset1_map_save, pageset1_copy_map_save;
10383 +
10384 +       if (prepare) {
10385 +               memcpy(&pageset1_map_save, &pageset1_map,
10386 +                               sizeof(struct dyn_pageflags));
10387 +               pageset1_map.bitmap = NULL;
10388 +               pageset1_map.sparse = 0;
10389 +               pageset1_map.initialised = 0;
10390 +               memcpy(&pageset1_copy_map_save, &pageset1_copy_map,
10391 +                       sizeof(struct dyn_pageflags));
10392 +               pageset1_copy_map.bitmap = NULL;
10393 +               pageset1_copy_map.sparse = 0;
10394 +               pageset1_copy_map.initialised = 0;
10395 +               set_toi_state(TOI_LOADING_ALT_IMAGE);
10396 +               toi_reset_alt_image_pageset2_pfn();
10397 +       } else {
10398 +               if (pageset1_map.bitmap)
10399 +                       free_dyn_pageflags(&pageset1_map);
10400 +               memcpy(&pageset1_map, &pageset1_map_save,
10401 +                       sizeof(struct dyn_pageflags));
10402 +               if (pageset1_copy_map.bitmap)
10403 +                       free_dyn_pageflags(&pageset1_copy_map);
10404 +               memcpy(&pageset1_copy_map, &pageset1_copy_map_save,
10405 +                       sizeof(struct dyn_pageflags));
10406 +               clear_toi_state(TOI_NOW_RESUMING);
10407 +               clear_toi_state(TOI_LOADING_ALT_IMAGE);
10408 +       }
10409 +}
10410 +
10411 +/**
10412 + * pre_resume_freeze: Freeze the system, before doing an atomic restore.
10413 + *
10414 + * Hot unplug cpus (if we didn't do it early) and freeze processes, in
10415 + * preparation for doing an atomic restore.
10416 + */
10417 +int pre_resume_freeze(void)
10418 +{
10419 +       if (!test_action_state(TOI_LATE_CPU_HOTPLUG)) {
10420 +               toi_prepare_status(DONT_CLEAR_BAR, "Disable nonboot cpus.");
10421 +               if (disable_nonboot_cpus()) {
10422 +                       set_abort_result(TOI_CPU_HOTPLUG_FAILED);
10423 +                       return 1;
10424 +               }
10425 +       }
10426 +
10427 +       toi_prepare_status(DONT_CLEAR_BAR, "Freeze processes.");
10428 +
10429 +       if (freeze_processes()) {
10430 +               printk(KERN_ERR "Some processes failed to stop.\n");
10431 +               return 1;
10432 +       }
10433 +
10434 +       return 0;
10435 +}
10436 +
10437 +/**
10438 + * do_toi_step: Perform a step in hibernating or resuming.
10439 + *
10440 + * Perform a step in hibernating or resuming an image. This abstraction
10441 + * is in preparation for implementing cluster support, and perhaps replacing
10442 + * uswsusp too (we haven't yet checked whether that's possible).
10443 + */
10444 +int do_toi_step(int step)
10445 +{
10446 +       switch (step) {
10447 +       case STEP_HIBERNATE_PREPARE_IMAGE:
10448 +               return do_prepare_image();
10449 +       case STEP_HIBERNATE_SAVE_IMAGE:
10450 +               return do_save_image();
10451 +       case STEP_HIBERNATE_POWERDOWN:
10452 +               return do_post_image_write();
10453 +       case STEP_RESUME_CAN_RESUME:
10454 +               return do_check_can_resume();
10455 +       case STEP_RESUME_LOAD_PS1:
10456 +               return do_load_atomic_copy();
10457 +       case STEP_RESUME_DO_RESTORE:
10458 +               /*
10459 +                * If we succeed, this doesn't return.
10460 +                * Instead, we return from do_save_image() in the
10461 +                * hibernated kernel.
10462 +                */
10463 +               return toi_atomic_restore();
10464 +       case STEP_RESUME_ALT_IMAGE:
10465 +               printk(KERN_INFO "Trying to resume alternate image.\n");
10466 +               toi_in_hibernate = 0;
10467 +               save_restore_alt_param(SAVE, NOQUIET);
10468 +               prepare_restore_load_alt_image(1);
10469 +               if (!do_check_can_resume()) {
10470 +                       printk(KERN_INFO "Nothing to resume from.\n");
10471 +                       goto out;
10472 +               }
10473 +               if (!do_load_atomic_copy())
10474 +                       toi_atomic_restore();
10475 +
10476 +               printk(KERN_INFO "Failed to load image.\n");
10477 +out:
10478 +               prepare_restore_load_alt_image(0);
10479 +               save_restore_alt_param(RESTORE, NOQUIET);
10480 +               break;
10481 +       case STEP_CLEANUP:
10482 +               do_cleanup(1);
10483 +               break;
10484 +       case STEP_QUIET_CLEANUP:
10485 +               do_cleanup(0);
10486 +               break;
10487 +       }
10488 +
10489 +       return 0;
10490 +}
10491 +EXPORT_SYMBOL_GPL(do_toi_step);
10492 +
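/*
 * [Editorial aside, not part of the patch] Since do_toi_step() is exported,
 * an external caller (such as the cluster support mentioned above) could
 * drive a whole cycle through the step interface. A sketch mirroring the
 * sequence _toi_try_hibernate() uses below; kernel context assumed, error
 * handling elided:
 */
static void example_hibernate_cycle(void)
{
        if (do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE))
                return;         /* preparation failed; already cleaned up */
        if (do_toi_step(STEP_HIBERNATE_SAVE_IMAGE))
                return;         /* save failed; do_save_image() cleaned up */
        if (toi_in_hibernate)   /* zero here means the resume-time return */
                do_toi_step(STEP_HIBERNATE_POWERDOWN);
}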
10493 +/* -- Functions for kickstarting a hibernate or resume --- */
10494 +
10495 +/**
10496 + * __toi_try_resume: Try to do the steps in resuming.
10497 + *
10498 + * Check if we have an image and if so try to resume. Clear the status
10499 + * flags too.
10500 + */
10501 +void __toi_try_resume(void)
10502 +{
10503 +       set_toi_state(TOI_TRYING_TO_RESUME);
10504 +       resume_attempted = 1;
10505 +
10506 +       current->flags |= PF_MEMALLOC;
10507 +
10508 +       if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
10509 +           !do_toi_step(STEP_RESUME_LOAD_PS1))
10510 +           do_toi_step(STEP_RESUME_DO_RESTORE);
10511 +
10512 +       do_cleanup(0);
10513 +
10514 +       current->flags &= ~PF_MEMALLOC;
10515 +
10516 +       clear_toi_state(TOI_IGNORE_LOGLEVEL);
10517 +       clear_toi_state(TOI_TRYING_TO_RESUME);
10518 +       clear_toi_state(TOI_NOW_RESUMING);
10519 +}
10520 +
10521 +/**
10522 + * _toi_try_resume: Wrapper calling __toi_try_resume from do_mounts.
10523 + *
10524 + * Wrapper for when __toi_try_resume is called from init/do_mounts.c,
10525 + * rather than from echo > /sys/power/tuxonice/do_resume.
10526 + */
10527 +void _toi_try_resume(void)
10528 +{
10529 +       resume_attempted = 1;
10530 +
10531 +       /*
10532 +        * There's a comment in kernel/power/disk.c that indicates
10533 +        * we should be able to use mutex_lock_nested below. That
10534 +        * doesn't seem to cut it, though, so let's just turn lockdep
10535 +        * off for now.
10536 +        */
10537 +       lockdep_off();
10538 +
10539 +       if (toi_start_anything(SYSFS_RESUMING))
10540 +               goto out;
10541 +
10542 +       /* Unlock will be done in do_cleanup */
10543 +       mutex_lock(&pm_mutex);
10544 +       got_pmsem = 1;
10545 +
10546 +       __toi_try_resume();
10547 +
10548 +       /*
10549 +        * For initramfs, we have to clear the boot time
10550 +        * flag after trying to resume
10551 +        */
10552 +       clear_toi_state(TOI_BOOT_TIME);
10553 +
10554 +out:
10555 +       toi_finish_anything(SYSFS_RESUMING);
10556 +       lockdep_on();
10557 +}
10559 +
10560 +/**
10561 + * _toi_try_hibernate: Try to start a hibernation cycle.
10562 + *
10563 + * have_pmsem: Whether pm_mutex is already taken.
10564 + *
10565 + * Start a hibernation cycle, coming in from either
10566 + * echo > /sys/power/tuxonice/do_suspend
10567 + *
10568 + * or
10569 + *
10570 + * echo disk > /sys/power/state
10571 + *
10572 + * In the latter case, we come in without pm_mutex taken; in the
10573 + * former, it has been taken.
10574 + */
10575 +int _toi_try_hibernate(int have_pmsem)
10576 +{
10577 +       int result = 0, sys_power_disk = 0;
10578 +
10579 +       if (!mutex_is_locked(&tuxonice_in_use)) {
10580 +               /* Came in via /sys/power/disk */
10581 +               if (toi_start_anything(SYSFS_HIBERNATING))
10582 +                       return -EBUSY;
10583 +               sys_power_disk = 1;
10584 +       }
10585 +
10586 +       get_pmsem = !have_pmsem;
10587 +
10588 +       if (strlen(alt_resume_param)) {
10589 +               attempt_to_parse_alt_resume_param();
10590 +
10591 +               if (!strlen(alt_resume_param)) {
10592 +                       printk(KERN_INFO "Alternate resume parameter now "
10593 +                                       "invalid. Aborting.\n");
10594 +                       goto out;
10595 +               }
10596 +       }
10597 +
10598 +       current->flags |= PF_MEMALLOC;
10599 +
10600 +       if (test_toi_state(TOI_CLUSTER_MODE)) {
10601 +               toi_initiate_cluster_hibernate();
10602 +               goto out;
10603 +       }
10604 +
10605 +       result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
10606 +       if (result)
10607 +               goto out;
10608 +
10609 +       if (test_action_state(TOI_FREEZER_TEST)) {
10610 +               do_cleanup(0);
10611 +               goto out;
10612 +       }
10613 +
10614 +       result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
10615 +       if (result)
10616 +               goto out;
10617 +
10618 +       /* This code runs at resume time too! */
10619 +       if (toi_in_hibernate)
10620 +               result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
10621 +out:
10622 +       current->flags &= ~PF_MEMALLOC;
10623 +
10624 +       if (sys_power_disk)
10625 +               toi_finish_anything(SYSFS_HIBERNATING);
10626 +
10627 +       return result;
10628 +}
10629 +
10630 +/*
10631 + * channel_no: If !0, -c <channel_no> is added to args (userui).
10632 + */
10633 +int toi_launch_userspace_program(char *command, int channel_no,
10634 +               enum umh_wait wait)
10635 +{
10636 +       int retval;
10637 +       static char *envp[] = {
10638 +                       "HOME=/",
10639 +                       "TERM=linux",
10640 +                       "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
10641 +                       NULL };
10642 +       static char *argv[] =
10643 +               { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
10644 +       char *channel = NULL;
10645 +       int arg = 0, size;
10646 +       char test_read[255];
10647 +       char *orig_posn = command;
10648 +
10649 +       if (!strlen(orig_posn))
10650 +               return 1;
10651 +
10652 +       if (channel_no) {
10653 +               channel = toi_kzalloc(4, 6, GFP_KERNEL);
10654 +               if (!channel) {
10655 +                       printk(KERN_INFO "Failed to allocate memory in "
10656 +                               "preparing to launch userspace program.\n");
10657 +                       return 1;
10658 +               }
10659 +       }
10660 +
10661 +       /* Up to 7 args supported */
10662 +       while (arg < 7) {
10663 +               sscanf(orig_posn, "%254s", test_read);
10664 +               size = strlen(test_read);
10665 +               if (!(size))
10666 +                       break;
10667 +               argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
10668 +               if (!argv[arg])
10669 +                       break;
10670 +               strcpy(argv[arg], test_read);
10669 +               orig_posn += size + 1;
10670 +               *test_read = 0;
10671 +               arg++;
10672 +       }
10673 +
10674 +       if (channel_no) {
10675 +               sprintf(channel, "-c%d", channel_no);
10676 +               argv[arg] = channel;
10677 +       }
10679 +
10680 +       retval = call_usermodehelper(argv[0], argv, envp, wait);
10681 +
10682 +       /*
10683 +        * If the program reports an error, retval = 256. Don't complain
10684 +        * about that here.
10685 +        */
10686 +       if (retval && retval != 256)
10687 +               printk(KERN_ERR "Failed to launch userspace program '%s': "
10688 +                               "Error %d\n",
10688 +                               command, retval);
10689 +
10690 +       {
10691 +               int i;
10692 +               for (i = 0; i < arg; i++)
10693 +                       if (argv[i] && argv[i] != channel)
10694 +                               toi_kfree(5, argv[i]);
10695 +       }
10696 +
10697 +       toi_kfree(4, channel);
10698 +
10699 +       return retval;
10700 +}
10701 +
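/*
 * [Editorial aside, not part of the patch] A runnable userspace sketch of
 * the argument-splitting loop above: repeated sscanf("%s") plus stepping
 * past each token. The "%254s" width (the scratch buffer is 255 bytes)
 * keeps the scan bounded; the command string is only an example:
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *command = "/usr/sbin/tuxoniceui_text -f 1";
        const char *posn = command;
        char token[255];
        int arg = 0;

        while (arg < 7) {       /* up to 7 args, as above */
                token[0] = '\0';
                if (sscanf(posn, "%254s", token) != 1)
                        break;
                printf("argv[%d] = \"%s\"\n", arg++, token);
                /* Step past the token and any separating spaces. */
                posn = strstr(posn, token) + strlen(token);
                while (*posn == ' ')
                        posn++;
        }
        return 0;
}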
10702 +/*
10703 + * This array contains entries that are automatically registered at
10704 + * boot. Modules and the console code register their own entries separately.
10705 + */
10706 +static struct toi_sysfs_data sysfs_params[] = {
10707 +       { TOI_ATTR("extra_pages_allowance", SYSFS_RW),
10708 +         SYSFS_LONG(&extra_pd1_pages_allowance, 0,
10709 +                       LONG_MAX, 0)
10710 +       },
10711 +
10712 +       { TOI_ATTR("image_exists", SYSFS_RW),
10713 +         SYSFS_CUSTOM(image_exists_read, image_exists_write,
10714 +                         SYSFS_NEEDS_SM_FOR_BOTH)
10715 +       },
10716 +
10717 +       { TOI_ATTR("resume", SYSFS_RW),
10718 +         SYSFS_STRING(resume_file, 255, SYSFS_NEEDS_SM_FOR_WRITE),
10719 +         .write_side_effect = attempt_to_parse_resume_device2,
10720 +       },
10721 +
10722 +       { TOI_ATTR("alt_resume_param", SYSFS_RW),
10723 +         SYSFS_STRING(alt_resume_param, 255, SYSFS_NEEDS_SM_FOR_WRITE),
10724 +         .write_side_effect = attempt_to_parse_alt_resume_param,
10725 +       },
10726 +       { TOI_ATTR("debug_info", SYSFS_READONLY),
10727 +         SYSFS_CUSTOM(get_toi_debug_info, NULL, 0)
10728 +       },
10729 +
10730 +       { TOI_ATTR("ignore_rootfs", SYSFS_RW),
10731 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_IGNORE_ROOTFS, 0)
10732 +       },
10733 +
10734 +       { TOI_ATTR("image_size_limit", SYSFS_RW),
10735 +         SYSFS_INT(&image_size_limit, -2, INT_MAX, 0)
10736 +       },
10737 +
10738 +       { TOI_ATTR("last_result", SYSFS_RW),
10739 +         SYSFS_UL(&toi_result, 0, 0, 0)
10740 +       },
10741 +
10742 +       { TOI_ATTR("no_multithreaded_io", SYSFS_RW),
10743 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_MULTITHREADED_IO, 0)
10744 +       },
10745 +
10746 +       { TOI_ATTR("no_flusher_thread", SYSFS_RW),
10747 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_FLUSHER_THREAD, 0)
10748 +       },
10749 +
10750 +       { TOI_ATTR("full_pageset2", SYSFS_RW),
10751 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_PAGESET2_FULL, 0)
10752 +       },
10753 +
10754 +       { TOI_ATTR("reboot", SYSFS_RW),
10755 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_REBOOT, 0)
10756 +       },
10757 +
10758 +       { TOI_ATTR("replace_swsusp", SYSFS_RW),
10759 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_REPLACE_SWSUSP, 0)
10760 +       },
10761 +
10762 +       { TOI_ATTR("resume_commandline", SYSFS_RW),
10763 +         SYSFS_STRING(toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0)
10764 +       },
10765 +
10766 +       { TOI_ATTR("version", SYSFS_READONLY),
10767 +         SYSFS_STRING(TOI_CORE_VERSION, 0, 0)
10768 +       },
10769 +
10770 +       { TOI_ATTR("no_load_direct", SYSFS_RW),
10771 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_DIRECT_LOAD, 0)
10772 +       },
10773 +
10774 +       { TOI_ATTR("freezer_test", SYSFS_RW),
10775 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_FREEZER_TEST, 0)
10776 +       },
10777 +
10778 +       { TOI_ATTR("test_bio", SYSFS_RW),
10779 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_TEST_BIO, 0)
10780 +       },
10781 +
10782 +       { TOI_ATTR("test_filter_speed", SYSFS_RW),
10783 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_TEST_FILTER_SPEED, 0)
10784 +       },
10785 +
10786 +       { TOI_ATTR("no_pageset2", SYSFS_RW),
10787 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_NO_PAGESET2, 0)
10788 +       },
10789 +
10790 +       { TOI_ATTR("late_cpu_hotplug", SYSFS_RW),
10791 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_LATE_CPU_HOTPLUG, 0)
10792 +       },
10793 +
10794 +       { TOI_ATTR("pre_hibernate_command", SYSFS_RW),
10795 +         SYSFS_STRING(pre_hibernate_command, 255, 0)
10796 +       },
10797 +
10798 +       { TOI_ATTR("post_hibernate_command", SYSFS_RW),
10799 +         SYSFS_STRING(post_hibernate_command, 255, 0)
10800 +       },
10801 +
10802 +#ifdef CONFIG_TOI_KEEP_IMAGE
10803 +       { TOI_ATTR("keep_image", SYSFS_RW),
10804 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_KEEP_IMAGE, 0)
10805 +       },
10806 +#endif
10807 +};
10808 +
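/*
 * [Editorial aside, not part of the patch] Each entry above appears as a
 * file under /sys/power/tuxonice (the directory named in the docstrings in
 * this file). A minimal userspace sketch of setting the resume location,
 * equivalent to `echo swap:/dev/hda1 > /sys/power/tuxonice/resume`; the
 * device is only an example, and the write triggers
 * attempt_to_parse_resume_device2() as the registered side effect:
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/power/tuxonice/resume", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "swap:/dev/hda1\n");
        return fclose(f) ? 1 : 0;
}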
10809 +struct toi_core_fns my_fns = {
10810 +       .get_nonconflicting_page = __toi_get_nonconflicting_page,
10811 +       .post_context_save = __toi_post_context_save,
10812 +       .try_hibernate = _toi_try_hibernate,
10813 +       .try_resume = _toi_try_resume,
10814 +};
10815 +
10816 +/**
10817 + * core_load: Initialisation of TuxOnIce core.
10818 + *
10819 + * Initialise the core, beginning with sysfs. Checksum and so on are part of
10820 + * the core, but have their own initialisation routines because they either
10821 + * aren't compiled in all the time or have their own subdirectories.
10822 + */
10823 +static __init int core_load(void)
10824 +{
10825 +       int i,
10826 +           numfiles = ARRAY_SIZE(sysfs_params);
10827 +
10828 +       printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
10829 +                       " (http://tuxonice.net)\n");
10830 +       strncpy(pre_hibernate_command, CONFIG_TOI_DEFAULT_PRE_HIBERNATE, 255);
10831 +       strncpy(post_hibernate_command, CONFIG_TOI_DEFAULT_POST_HIBERNATE, 255);
10832 +
10833 +       if (toi_sysfs_init())
10834 +               return 1;
10835 +
10836 +       for (i = 0; i < numfiles; i++)
10837 +               toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
10838 +
10839 +       toi_core_fns = &my_fns;
10840 +
10841 +       if (toi_alloc_init())
10842 +               return 1;
10843 +       if (toi_checksum_init())
10844 +               return 1;
10845 +       if (toi_cluster_init())
10846 +               return 1;
10847 +       if (toi_usm_init())
10848 +               return 1;
10849 +       if (toi_ui_init())
10850 +               return 1;
10851 +       if (toi_poweroff_init())
10852 +               return 1;
10853 +
10854 +       return 0;
10855 +}
10856 +
10857 +#ifdef MODULE
10858 +/**
10859 + * core_unload: Prepare to unload the core code.
10860 + */
10861 +static __exit void core_unload(void)
10862 +{
10863 +       int i,
10864 +           numfiles = ARRAY_SIZE(sysfs_params);
10865 +
10866 +       toi_alloc_exit();
10867 +       toi_poweroff_exit();
10868 +       toi_ui_exit();
10869 +       toi_checksum_exit();
10870 +       toi_cluster_exit();
10871 +       toi_usm_exit();
10872 +
10873 +       for (i = 0; i < numfiles; i++)
10874 +               toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
10875 +
10876 +       toi_core_fns = NULL;
10877 +
10878 +       toi_sysfs_exit();
10879 +}
10880 +MODULE_LICENSE("GPL");
10881 +module_init(core_load);
10882 +module_exit(core_unload);
10883 +#else
10884 +late_initcall(core_load);
10885 +#endif
10886 +
10887 +#ifdef CONFIG_TOI_EXPORTS
10888 +EXPORT_SYMBOL_GPL(tuxonice_signature);
10889 +EXPORT_SYMBOL_GPL(pagedir2);
10890 +EXPORT_SYMBOL_GPL(toi_fail_num);
10891 +EXPORT_SYMBOL_GPL(do_check_can_resume);
10892 +#endif
10893 diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
10894 new file mode 100644
10895 index 0000000..556ec22
10896 --- /dev/null
10897 +++ b/kernel/power/tuxonice_io.c
10898 @@ -0,0 +1,1427 @@
10899 +/*
10900 + * kernel/power/tuxonice_io.c
10901 + *
10902 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
10903 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
10904 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
10905 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
10906 + *
10907 + * This file is released under the GPLv2.
10908 + *
10909 + * It contains high level IO routines for hibernating.
10910 + *
10911 + */
10912 +
10913 +#include <linux/suspend.h>
10914 +#include <linux/version.h>
10915 +#include <linux/utsname.h>
10916 +#include <linux/mount.h>
10917 +#include <linux/highmem.h>
10918 +#include <linux/module.h>
10919 +#include <linux/kthread.h>
10920 +#include <linux/dyn_pageflags.h>
10921 +#include <asm/tlbflush.h>
10922 +
10923 +#include "tuxonice.h"
10924 +#include "tuxonice_modules.h"
10925 +#include "tuxonice_pageflags.h"
10926 +#include "tuxonice_io.h"
10927 +#include "tuxonice_ui.h"
10928 +#include "tuxonice_storage.h"
10929 +#include "tuxonice_prepare_image.h"
10930 +#include "tuxonice_extent.h"
10931 +#include "tuxonice_sysfs.h"
10932 +#include "tuxonice_builtin.h"
10933 +#include "tuxonice_checksum.h"
10934 +#include "tuxonice_alloc.h"
10935 +char alt_resume_param[256];
10936 +
10937 +/* Variables shared between threads and updated under the mutex */
10938 +static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
10939 +static int io_index, io_nextupdate, io_pc, io_pc_step;
10940 +static unsigned long pfn, other_pfn;
10941 +static DEFINE_MUTEX(io_mutex);
10942 +static DEFINE_PER_CPU(struct page *, last_sought);
10943 +static DEFINE_PER_CPU(struct page *, last_high_page);
10944 +static DEFINE_PER_CPU(char *, checksum_locn);
10945 +static DEFINE_PER_CPU(struct pbe *, last_low_page);
10946 +static atomic_t io_count;
10947 +atomic_t toi_io_workers;
10948 +DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
10949 +int toi_bio_queue_flusher_should_finish;
10950 +
10951 +/* toi_attempt_to_parse_resume_device
10952 + *
10953 + * Can we hibernate, using the current resume= parameter?
10954 + */
10955 +int toi_attempt_to_parse_resume_device(int quiet)
10956 +{
10957 +       struct list_head *Allocator;
10958 +       struct toi_module_ops *thisAllocator;
10959 +       int result, returning = 0;
10960 +
10961 +       if (toi_activate_storage(0))
10962 +               return 0;
10963 +
10964 +       toiActiveAllocator = NULL;
10965 +       clear_toi_state(TOI_RESUME_DEVICE_OK);
10966 +       clear_toi_state(TOI_CAN_RESUME);
10967 +       clear_result_state(TOI_ABORTED);
10968 +
10969 +       if (!toiNumAllocators) {
10970 +               if (!quiet)
10971 +                       printk(KERN_INFO "TuxOnIce: No storage allocators have "
10972 +                               "been registered. Hibernating will be "
10973 +                               "disabled.\n");
10974 +               goto cleanup;
10975 +       }
10976 +
10977 +       if (!resume_file[0]) {
10978 +               if (!quiet)
10979 +                       printk(KERN_INFO "TuxOnIce: Resume= parameter is "
10980 +                               "empty. Hibernating will be disabled.\n");
10981 +               goto cleanup;
10982 +       }
10983 +
10984 +       list_for_each(Allocator, &toiAllocators) {
10985 +               thisAllocator = list_entry(Allocator, struct toi_module_ops,
10986 +                                                               type_list);
10987 +
10988 +               /*
10989 +                * Not sure why you'd want to disable an allocator, but
10990 +                * we should honour the flag if we're providing it
10991 +                */
10992 +               if (!thisAllocator->enabled)
10993 +                       continue;
10994 +
10995 +               result = thisAllocator->parse_sig_location(
10996 +                               resume_file, (toiNumAllocators == 1),
10997 +                               quiet);
10998 +
10999 +               switch (result) {
11000 +               case -EINVAL:
11001 +                       /* For this allocator, but not a valid
11002 +                        * configuration. Error already printed. */
11003 +                       goto cleanup;
11004 +
11005 +               case 0:
11006 +                       /* For this allocator and valid. */
11007 +                       toiActiveAllocator = thisAllocator;
11008 +
11009 +                       set_toi_state(TOI_RESUME_DEVICE_OK);
11010 +                       set_toi_state(TOI_CAN_RESUME);
11011 +                       returning = 1;
11012 +                       goto cleanup;
11013 +               }
11014 +       }
11015 +       if (!quiet)
11016 +               printk(KERN_INFO "TuxOnIce: No matching enabled allocator "
11017 +                               "found. Resuming disabled.\n");
11018 +cleanup:
11019 +       toi_deactivate_storage(0);
11020 +       return returning;
11021 +}
11022 +
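/*
 * [Editorial aside, not part of the patch] Each allocator's
 * parse_sig_location() decides whether a resume= value such as
 * "swap:/dev/hda1" is its own. A runnable userspace sketch of that style
 * of check, with a simplified signature and -22 standing in for -EINVAL
 * (this is not the real swap allocator's parser):
 */
#include <stdio.h>
#include <string.h>

static int parse_sig_location(const char *resume, const char *name)
{
        size_t len = strlen(name);

        if (strncmp(resume, name, len) || resume[len] != ':')
                return 1;       /* not for this allocator */
        if (!resume[len + 1])
                return -22;     /* ours, but no device: invalid */
        printf("%s allocator: signature on %s\n", name, resume + len + 1);
        return 0;               /* ours and valid */
}

int main(void)
{
        return parse_sig_location("swap:/dev/hda1", "swap") ? 1 : 0;
}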
11023 +void attempt_to_parse_resume_device2(void)
11024 +{
11025 +       toi_prepare_usm();
11026 +       toi_attempt_to_parse_resume_device(0);
11027 +       toi_cleanup_usm();
11028 +}
11029 +
11030 +void save_restore_alt_param(int replace, int quiet)
11031 +{
11032 +       static char resume_param_save[255];
11033 +       static unsigned long toi_state_save;
11034 +
11035 +       if (replace) {
11036 +               toi_state_save = toi_state;
11037 +               strcpy(resume_param_save, resume_file);
11038 +               strcpy(resume_file, alt_resume_param);
11039 +       } else {
11040 +               strcpy(resume_file, resume_param_save);
11041 +               toi_state = toi_state_save;
11042 +       }
11043 +       toi_attempt_to_parse_resume_device(quiet);
11044 +}
11045 +
11046 +void attempt_to_parse_alt_resume_param(void)
11047 +{
11048 +       int ok = 0;
11049 +
11050 +       if (!strlen(alt_resume_param))
11051 +               return;
11052 +
11053 +       /* Temporarily set resume_file to the alternate value */
11054 +       printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n");
11055 +       save_restore_alt_param(SAVE, NOQUIET);
11056 +       if (test_toi_state(TOI_CAN_RESUME))
11057 +               ok = 1;
11058 +
11059 +       printk(KERN_INFO "=== Done ===\n");
11060 +       save_restore_alt_param(RESTORE, QUIET);
11061 +
11062 +       /* If not ok, clear the string */
11063 +       if (ok)
11064 +               return;
11065 +
11066 +       printk(KERN_INFO "Can't resume from that location; clearing "
11067 +                       "alt_resume_param.\n");
11068 +       alt_resume_param[0] = '\0';
11069 +}
11070 +
11071 +/* noresume_reset_modules
11072 + *
11073 + * Description:        When we read the start of an image, modules (and especially the
11074 + *             active allocator) might need to reset data structures if we
11075 + *             decide to remove the image rather than resuming from it.
11076 + */
11077 +
11078 +static void noresume_reset_modules(void)
11079 +{
11080 +       struct toi_module_ops *this_filter;
11081 +
11082 +       list_for_each_entry(this_filter, &toi_filters, type_list)
11083 +               if (this_filter->noresume_reset)
11084 +                       this_filter->noresume_reset();
11085 +
11086 +       if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
11087 +               toiActiveAllocator->noresume_reset();
11088 +}
11089 +
11090 +/* fill_toi_header()
11091 + *
11092 + * Description:        Fill the hibernate header structure.
11093 + * Arguments:  struct toi_header: Header data structure to be filled.
11094 + */
11095 +
11096 +static int fill_toi_header(struct toi_header *sh)
11097 +{
11098 +       int i, error;
11099 +
11100 +       error = init_swsusp_header((struct swsusp_info *) sh);
11101 +       if (error)
11102 +               return error;
11103 +
11104 +       sh->pagedir = pagedir1;
11105 +       sh->pageset_2_size = pagedir2.size;
11106 +       sh->param0 = toi_result;
11107 +       sh->param1 = toi_bkd.toi_action;
11108 +       sh->param2 = toi_bkd.toi_debug_state;
11109 +       sh->param3 = toi_bkd.toi_default_console_level;
11110 +       sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
11111 +       for (i = 0; i < 4; i++)
11112 +               sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
11113 +       sh->bkd = boot_kernel_data_buffer;
11114 +       return 0;
11115 +}
11116 +
11117 +/*
11118 + * rw_init_modules
11119 + *
11120 + * Iterate over modules, preparing the ones that will be used to read or write
11121 + * data.
11122 + */
11123 +static int rw_init_modules(int rw, int which)
11124 +{
11125 +       struct toi_module_ops *this_module;
11126 +       /* Initialise page transformers */
11127 +       list_for_each_entry(this_module, &toi_filters, type_list) {
11128 +               if (!this_module->enabled)
11129 +                       continue;
11130 +               if (this_module->rw_init && this_module->rw_init(rw, which)) {
11131 +                       abort_hibernate(TOI_FAILED_MODULE_INIT,
11132 +                               "Failed to initialise the %s filter.",
11133 +                               this_module->name);
11134 +                       return 1;
11135 +               }
11136 +       }
11137 +
11138 +       /* Initialise allocator */
11139 +       if (toiActiveAllocator->rw_init(rw, which)) {
11140 +               abort_hibernate(TOI_FAILED_MODULE_INIT,
11141 +                               "Failed to initialise the allocator.");
11142 +               return 1;
11143 +       }
11144 +
11145 +       /* Initialise other modules */
11146 +       list_for_each_entry(this_module, &toi_modules, module_list) {
11147 +               if (!this_module->enabled ||
11148 +                   this_module->type == FILTER_MODULE ||
11149 +                   this_module->type == WRITER_MODULE)
11150 +                       continue;
11151 +               if (this_module->rw_init && this_module->rw_init(rw, which)) {
11152 +                       set_abort_result(TOI_FAILED_MODULE_INIT);
11153 +                       printk(KERN_INFO "Setting aborted flag due to module "
11154 +                                       "init failure.\n");
11155 +                       return 1;
11156 +               }
11157 +       }
11158 +
11159 +       return 0;
11160 +}
11161 +
11162 +/*
11163 + * rw_cleanup_modules
11164 + *
11165 + * Cleanup components after reading or writing a set of pages.
11166 + * Only the allocator may fail.
11167 + */
11168 +static int rw_cleanup_modules(int rw)
11169 +{
11170 +       struct toi_module_ops *this_module;
11171 +       int result = 0;
11172 +
11173 +       /* Cleanup other modules */
11174 +       list_for_each_entry(this_module, &toi_modules, module_list) {
11175 +               if (!this_module->enabled ||
11176 +                   this_module->type == FILTER_MODULE ||
11177 +                   this_module->type == WRITER_MODULE)
11178 +                       continue;
11179 +               if (this_module->rw_cleanup)
11180 +                       result |= this_module->rw_cleanup(rw);
11181 +       }
11182 +
11183 +       /* Flush data and cleanup */
11184 +       list_for_each_entry(this_module, &toi_filters, type_list) {
11185 +               if (!this_module->enabled)
11186 +                       continue;
11187 +               if (this_module->rw_cleanup)
11188 +                       result |= this_module->rw_cleanup(rw);
11189 +       }
11190 +
11191 +       result |= toiActiveAllocator->rw_cleanup(rw);
11192 +
11193 +       return result;
11194 +}
11195 +
11196 +static struct page *copy_page_from_orig_page(struct page *orig_page)
11197 +{
11198 +       int is_high = PageHighMem(orig_page), index, min, max;
11199 +       struct page *high_page = NULL,
11200 +                   **my_last_high_page = &__get_cpu_var(last_high_page),
11201 +                   **my_last_sought = &__get_cpu_var(last_sought);
11202 +       struct pbe *this, **my_last_low_page = &__get_cpu_var(last_low_page);
11203 +       void *compare;
11204 +
11205 +       if (is_high) {
11206 +               if (*my_last_sought && *my_last_high_page &&
11207 +                               *my_last_sought < orig_page)
11208 +                       high_page = *my_last_high_page;
11209 +               else
11210 +                       high_page = (struct page *) restore_highmem_pblist;
11211 +               this = (struct pbe *) kmap(high_page);
11212 +               compare = orig_page;
11213 +       } else {
11214 +               if (*my_last_sought && *my_last_low_page &&
11215 +                               *my_last_sought < orig_page)
11216 +                       this = *my_last_low_page;
11217 +               else
11218 +                       this = restore_pblist;
11219 +               compare = page_address(orig_page);
11220 +       }
11221 +
11222 +       *my_last_sought = orig_page;
11223 +
11224 +       /* Locate page containing pbe */
11225 +       while (this[PBES_PER_PAGE - 1].next &&
11226 +                       this[PBES_PER_PAGE - 1].orig_address < compare) {
11227 +               if (is_high) {
11228 +                       struct page *next_high_page = (struct page *)
11229 +                               this[PBES_PER_PAGE - 1].next;
11230 +                       kunmap(high_page);
11231 +                       this = kmap(next_high_page);
11232 +                       high_page = next_high_page;
11233 +               } else
11234 +                       this = this[PBES_PER_PAGE - 1].next;
11235 +       }
11236 +
11237 +       /* Do a binary search within the page */
11238 +       min = 0;
11239 +       max = PBES_PER_PAGE;
11240 +       index = PBES_PER_PAGE / 2;
11241 +       while (max - min) {
11242 +               if (!this[index].orig_address ||
11243 +                   this[index].orig_address > compare)
11244 +                       max = index;
11245 +               else if (this[index].orig_address == compare) {
11246 +                       if (is_high) {
11247 +                               struct page *page = this[index].address;
11248 +                               *my_last_high_page = high_page;
11249 +                               kunmap(high_page);
11250 +                               return page;
11251 +                       }
11252 +                       *my_last_low_page = this;
11253 +                       return virt_to_page(this[index].address);
11254 +               } else
11255 +                       min = index;
11256 +               index = ((max + min) / 2);
11257 +       }
11258 +
11259 +       if (is_high)
11260 +               kunmap(high_page);
11261 +
11262 +       abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
11263 +               " orig page %p. this[index].orig_address=%p.\n", orig_page,
11264 +               this[index].orig_address);
11265 +       return NULL;
11266 +}
11267 +
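/*
 * [Editorial aside, not part of the patch] A runnable userspace mock of
 * the in-page binary search above, with a sorted array standing in for a
 * kmapped page of pbes (no chaining or highmem). Note the original's
 * "min = index" step only terminates because the sought address is
 * expected to be present; the sketch uses "min = index + 1" so it also
 * terminates on a miss:
 */
#include <stdio.h>

struct mock_pbe { unsigned long orig_address, address; };

static unsigned long find_copy(struct mock_pbe *this, int n,
                               unsigned long compare)
{
        int min = 0, max = n, index = n / 2;

        while (max - min) {
                if (!this[index].orig_address ||
                    this[index].orig_address > compare)
                        max = index;
                else if (this[index].orig_address == compare)
                        return this[index].address;
                else
                        min = index + 1;
                index = (max + min) / 2;
        }
        return 0;       /* not found: the abort_hibernate() path above */
}

int main(void)
{
        struct mock_pbe page[4] = {
                { 0x1000, 0xa000 }, { 0x2000, 0xb000 },
                { 0x3000, 0xc000 }, { 0x4000, 0xd000 },
        };

        printf("copy of 0x3000 is at 0x%lx\n", find_copy(page, 4, 0x3000));
        return 0;
}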
11268 +/*
11269 + * worker_rw_loop
11270 + *
11271 + * The body of each I/O thread: read or write pages until none remain.
11272 + */
11273 +static int worker_rw_loop(void *data)
11274 +{
11275 +       unsigned long orig_pfn, write_pfn;
11276 +       int result, my_io_index = 0, temp;
11277 +       struct toi_module_ops *first_filter = toi_get_next_filter(NULL);
11278 +       struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
11279 +
11280 +       atomic_inc(&toi_io_workers);
11281 +       mutex_lock(&io_mutex);
11282 +
11283 +       do {
11284 +               int buf_size;
11285 +
11286 +               /*
11287 +                * What page to use? If reading, don't know yet which page's
11288 +                * data will be read, so always use the buffer. If writing,
11289 +                * use the copy (Pageset1) or original page (Pageset2), but
11290 +                * always write the pfn of the original page.
11291 +                */
11292 +               if (io_write) {
11293 +                       struct page *page;
11294 +                       char **my_checksum_locn = &__get_cpu_var(checksum_locn);
11295 +
11296 +                       pfn = get_next_bit_on(&io_map, pfn);
11297 +
11298 +                       /* Another thread could have beaten us to it. */
11299 +                       if (pfn == max_pfn + 1) {
11300 +                               if (atomic_read(&io_count)) {
11301 +                                       printk(KERN_ERR "Ran out of pfns "
11302 +                                               "but io_count "
11302 +                                               "is still %d.\n",
11303 +                                               atomic_read(&io_count));
11304 +                                       BUG();
11305 +                               }
11306 +                               break;
11307 +                       }
11308 +
11309 +                       my_io_index = io_finish_at -
11310 +                               atomic_sub_return(1, &io_count);
11311 +
11312 +                       orig_pfn = pfn;
11313 +                       write_pfn = pfn;
11314 +
11315 +                       /*
11316 +                        * Other_pfn is updated by all threads, so we're not
11317 +                        * writing the same page multiple times.
11318 +                        */
11319 +                       clear_dynpageflag(&io_map, pfn_to_page(pfn));
11320 +                       if (io_pageset == 1) {
11321 +                               other_pfn = get_next_bit_on(&pageset1_map,
11322 +                                               other_pfn);
11323 +                               write_pfn = other_pfn;
11324 +                       }
11325 +                       page = pfn_to_page(pfn);
11326 +
11327 +                       if (io_pageset == 2)
11328 +                               *my_checksum_locn =
11329 +                                       tuxonice_get_next_checksum();
11330 +
11331 +                       mutex_unlock(&io_mutex);
11332 +
11333 +                       if (io_pageset == 2 &&
11334 +                           tuxonice_calc_checksum(page, *my_checksum_locn))
11335 +                                       return 1;
11336 +
11337 +                       result = first_filter->write_page(write_pfn, page,
11338 +                                       PAGE_SIZE);
11339 +               } else {
11340 +                       my_io_index = io_finish_at -
11341 +                               atomic_sub_return(1, &io_count);
11342 +                       mutex_unlock(&io_mutex);
11343 +
11344 +                       /*
11345 +                        * Are we aborting? If so, don't submit any more I/O as
11346 +                        * resetting the resume_attempted flag (from ui.c) will
11347 +                        * clear the bdev flags, making this thread oops.
11348 +                        */
11349 +                       if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
11350 +                               atomic_dec(&toi_io_workers);
11351 +                               if (!atomic_read(&toi_io_workers))
11352 +                                       set_toi_state(TOI_IO_STOPPED);
11353 +                               while (1)
11354 +                                       schedule();
11355 +                       }
11356 +
11357 +                       result = first_filter->read_page(&write_pfn, buffer,
11358 +                                       &buf_size);
11359 +                       if (buf_size != PAGE_SIZE) {
11360 +                               abort_hibernate(TOI_FAILED_IO,
11361 +                                       "I/O pipeline returned %d bytes instead"
11362 +                                       " of %lu.\n", buf_size, PAGE_SIZE);
11363 +                               mutex_lock(&io_mutex);
11364 +                               break;
11365 +                       }
11366 +               }
11367 +
11368 +               if (result) {
11369 +                       io_result = result;
11370 +                       if (io_write) {
11371 +                               printk(KERN_INFO "Write chunk returned %d.\n",
11372 +                                               result);
11373 +                               abort_hibernate(TOI_FAILED_IO,
11374 +                                       "Failed to write a chunk of the "
11375 +                                       "image.");
11376 +                               mutex_lock(&io_mutex);
11377 +                               break;
11378 +                       }
11379 +                       panic("Read chunk returned (%d)", result);
11380 +               }
11381 +
11382 +               /*
11383 +                * Discard reads of resaved pages while reading ps2
11384 +                * and unwanted pages while rereading ps2 when aborting.
11385 +                */
11386 +               if (!io_write && !PageResave(pfn_to_page(write_pfn))) {
11387 +                       struct page *final_page = pfn_to_page(write_pfn),
11388 +                                   *copy_page = final_page;
11389 +                       char *virt, *buffer_virt;
11390 +
11391 +                       if (io_pageset == 1 && !load_direct(final_page)) {
11392 +                               copy_page =
11393 +                                       copy_page_from_orig_page(final_page);
11394 +                               BUG_ON(!copy_page);
11395 +                       }
11396 +
11397 +                       if (test_dynpageflag(&io_map, final_page)) {
11398 +                               virt = kmap(copy_page);
11399 +                               buffer_virt = kmap(buffer);
11400 +                               memcpy(virt, buffer_virt, PAGE_SIZE);
11401 +                               kunmap(copy_page);
11402 +                               kunmap(buffer);
11403 +                               clear_dynpageflag(&io_map, final_page);
11404 +                       } else {
11405 +                               mutex_lock(&io_mutex);
11406 +                               atomic_inc(&io_count);
11407 +                               mutex_unlock(&io_mutex);
11408 +                       }
11409 +               }
11410 +
11411 +               temp = my_io_index + io_base - io_nextupdate;
11412 +
11413 +               if (my_io_index + io_base == io_nextupdate)
11414 +                       io_nextupdate = toi_update_status(my_io_index +
11415 +                               io_base, io_barmax, " %d/%d MB ",
11416 +                               MB(io_base+my_io_index+1), MB(io_barmax));
11417 +
11418 +               if (my_io_index == io_pc) {
11419 +                       printk("%s%d%%...", io_pc_step == 1 ? KERN_ERR : "",
11420 +                                       20 * io_pc_step);
11421 +                       io_pc_step++;
11422 +                       io_pc = io_finish_at * io_pc_step / 5;
11423 +               }
11424 +
11425 +               toi_cond_pause(0, NULL);
11426 +
11427 +               /*
11428 +                * Subtle: If there's less I/O still to be done than threads
11429 +                * running, quit. This stops us doing I/O beyond the end of
11430 +                * the image when reading.
11431 +                *
11432 +                * Possible race condition. Two threads could do the test at
11433 +                * the same time; one should exit and one should continue.
11434 +                * Therefore we take the mutex before comparing and exiting.
11435 +                */
11436 +
11437 +               mutex_lock(&io_mutex);
11438 +
11439 +       } while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
11440 +               !(io_write && test_result_state(TOI_ABORTED)));
11441 +
11442 +       if (atomic_dec_and_test(&toi_io_workers)) {
11443 +               toi_bio_queue_flusher_should_finish = 1;
11444 +               wake_up(&toi_io_queue_flusher);
11445 +       }
11446 +       mutex_unlock(&io_mutex);
11447 +
11448 +       toi__free_page(28, buffer);
11449 +
11450 +       return 0;
11451 +}
11452 +
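/*
 * [Editorial aside, not part of the patch] The loop's exit test - continue
 * only while at least as many pages remain as there are running workers -
 * is what lets the pool drain without submitting I/O past the end of the
 * image. A runnable userspace sketch of that termination rule, with
 * pthreads standing in for the per-cpu kthreads (build with -lpthread):
 */
#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

static int io_count = 100, workers = NWORKERS;
static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
        int done = 0;

        pthread_mutex_lock(&io_mutex);
        while (io_count >= workers) {
                io_count--;             /* claim one page */
                done++;
                pthread_mutex_unlock(&io_mutex);
                /* Real I/O would happen here, outside the mutex. */
                pthread_mutex_lock(&io_mutex);
        }
        workers--;      /* one fewer thread competing for what's left */
        pthread_mutex_unlock(&io_mutex);
        printf("worker %ld handled %d pages\n", (long) arg, done);
        return NULL;
}

int main(void)
{
        pthread_t threads[NWORKERS];
        long i;

        for (i = 0; i < NWORKERS; i++)
                pthread_create(&threads[i], NULL, worker, (void *) i);
        for (i = 0; i < NWORKERS; i++)
                pthread_join(threads[i], NULL);
        printf("pages left when the pool drained: %d\n", io_count);
        return 0;
}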
11453 +int start_other_threads(void)
11454 +{
11455 +       int cpu, num_started = 0;
11456 +       struct task_struct *p;
11457 +
11458 +       for_each_online_cpu(cpu) {
11459 +               if (cpu == smp_processor_id())
11460 +                       continue;
11461 +
11462 +               p = kthread_create(worker_rw_loop, NULL, "ks2io/%d", cpu);
11463 +               if (IS_ERR(p)) {
11464 +                       printk(KERN_ERR "ks2io for %d failed\n", cpu);
11465 +                       continue;
11466 +               }
11467 +               kthread_bind(p, cpu);
11468 +               p->flags |= PF_MEMALLOC;
11469 +               wake_up_process(p);
11470 +               num_started++;
11471 +       }
11472 +
11473 +       return num_started;
11474 +}
11475 +
11476 +/*
11477 + * do_rw_loop
11478 + *
11479 + * The main I/O loop for reading or writing pages.
11480 + */
11481 +static int do_rw_loop(int write, int finish_at, struct dyn_pageflags *pageflags,
11482 +               int base, int barmax, int pageset)
11483 +{
11484 +       int index = 0, cpu, num_other_threads = 0;
11485 +
11486 +       if (!finish_at)
11487 +               return 0;
11488 +
11489 +       io_write = write;
11490 +       io_finish_at = finish_at;
11491 +       io_base = base;
11492 +       io_barmax = barmax;
11493 +       io_pageset = pageset;
11494 +       io_index = 0;
11495 +       io_pc = io_finish_at / 5;
11496 +       io_pc_step = 1;
11497 +       io_result = 0;
11498 +       io_nextupdate = base + 1;
11499 +       toi_bio_queue_flusher_should_finish = 0;
11500 +
11501 +       for_each_online_cpu(cpu) {
11502 +               per_cpu(last_sought, cpu) = NULL;
11503 +               per_cpu(last_low_page, cpu) = NULL;
11504 +               per_cpu(last_high_page, cpu) = NULL;
11505 +       }
11506 +
11507 +       /* Ensure all bits clear */
11508 +       clear_dyn_pageflags(&io_map);
11509 +
11510 +       /* Set the bits for the pages to write */
11511 +       pfn = get_next_bit_on(pageflags, max_pfn + 1);
11512 +
11513 +       while (pfn < max_pfn + 1 && index < finish_at) {
11514 +               set_dynpageflag(&io_map, pfn_to_page(pfn));
11515 +               pfn = get_next_bit_on(pageflags, pfn);
11516 +               index++;
11517 +       }
11518 +
11519 +       BUG_ON(index < finish_at);
11520 +
11521 +       atomic_set(&io_count, finish_at);
11522 +
11523 +       pfn = max_pfn + 1;
11524 +       other_pfn = pfn;
11525 +
11526 +       clear_toi_state(TOI_IO_STOPPED);
11527 +
11528 +       if (!test_action_state(TOI_NO_MULTITHREADED_IO))
11529 +               num_other_threads = start_other_threads();
11530 +
11531 +       if (!num_other_threads || !toiActiveAllocator->io_flusher ||
11532 +               test_action_state(TOI_NO_FLUSHER_THREAD))
11533 +               worker_rw_loop(NULL);
11534 +       else
11535 +               toiActiveAllocator->io_flusher(write);
11536 +
11537 +       while (atomic_read(&toi_io_workers))
11538 +               schedule();
11539 +
11540 +       set_toi_state(TOI_IO_STOPPED);
11541 +       if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
11542 +               while (1)
11543 +                       schedule();
11544 +       }
11545 +
11546 +       if (!io_result) {
11547 +               printk("done.\n");
11548 +
11549 +               toi_update_status(io_base + io_finish_at, io_barmax,
11550 +                               " %d/%d MB ",
11551 +                               MB(io_base + io_finish_at), MB(io_barmax));
11552 +       }
11553 +
11554 +       if (io_write && test_result_state(TOI_ABORTED))
11555 +               io_result = 1;
11556 +       else { /* All I/O done? */
11557 +               if  (get_next_bit_on(&io_map, max_pfn + 1) != max_pfn + 1) {
11558 +                       printk(KERN_INFO "Finished I/O loop but still work to "
11559 +                                       "do?\nFinish at = %d. io_count = %d.\n",
11560 +                                       finish_at, atomic_read(&io_count));
11561 +                       BUG();
11562 +               }
11563 +       }
11564 +
11565 +       return io_result;
11566 +}
11567 +
11568 +/* write_pageset()
11569 + *
11570 + * Description:        Write a pageset to disk.
11571 + * Arguments:  pagedir:        Which pagedir to write.
11572 + * Returns:    Zero on success or -1 on failure.
11573 + */
11574 +
11575 +int write_pageset(struct pagedir *pagedir)
11576 +{
11577 +       int finish_at, base = 0, start_time, end_time;
11578 +       int barmax = pagedir1.size + pagedir2.size;
11579 +       long error = 0;
11580 +       struct dyn_pageflags *pageflags;
11581 +
11582 +       /*
11583 +        * Even if there is nothing to read or write, the allocator
11584 +        * may need the init/cleanup for its housekeeping (e.g.
11585 +        * pageset1 may start where pageset2 ends when writing).
11586 +        */
11587 +       finish_at = pagedir->size;
11588 +
11589 +       if (pagedir->id == 1) {
11590 +               toi_prepare_status(DONT_CLEAR_BAR,
11591 +                               "Writing kernel & process data...");
11592 +               base = pagedir2.size;
11593 +               if (test_action_state(TOI_TEST_FILTER_SPEED) ||
11594 +                   test_action_state(TOI_TEST_BIO))
11595 +                       pageflags = &pageset1_map;
11596 +               else
11597 +                       pageflags = &pageset1_copy_map;
11598 +       } else {
11599 +               toi_prepare_status(CLEAR_BAR, "Writing caches...");
11600 +               pageflags = &pageset2_map;
11601 +       }
11602 +
11603 +       start_time = jiffies;
11604 +
11605 +       if (rw_init_modules(1, pagedir->id)) {
11606 +               abort_hibernate(TOI_FAILED_MODULE_INIT,
11607 +                               "Failed to initialise modules for writing.");
11608 +               error = 1;
11609 +       }
11610 +
11611 +       if (!error)
11612 +               error = do_rw_loop(1, finish_at, pageflags, base, barmax,
11613 +                               pagedir->id);
11614 +
11615 +       if (rw_cleanup_modules(WRITE) && !error) {
11616 +               abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
11617 +                               "Failed to cleanup after writing.");
11618 +               error = 1;
11619 +       }
11620 +
11621 +       end_time = jiffies;
11622 +
11623 +       if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
11624 +               toi_bkd.toi_io_time[0][0] += finish_at;
11625 +               toi_bkd.toi_io_time[0][1] += (end_time - start_time);
11626 +       }
11627 +
11628 +       return error;
11629 +}
11630 +
11631 +/* read_pageset()
11632 + *
11633 + * Description:        Read a pageset from disk.
11634 + * Arguments:  pagedir:        Which pagedir to read.
11635 + *             overwrittenpagesonly: Whether to read the whole pageset or
11636 + *             only part.
11637 + * Returns:    Zero on success or -1 on failure.
11638 + */
11639 +
11640 +static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
11641 +{
11642 +       int result = 0, base = 0, start_time, end_time;
11643 +       int finish_at = pagedir->size;
11644 +       int barmax = pagedir1.size + pagedir2.size;
11645 +       struct dyn_pageflags *pageflags;
11646 +
11647 +       if (pagedir->id == 1) {
11648 +               toi_prepare_status(CLEAR_BAR,
11649 +                               "Reading kernel & process data...");
11650 +               pageflags = &pageset1_map;
11651 +       } else {
11652 +               toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
11653 +               if (overwrittenpagesonly)
11654 +                       barmax = finish_at = min(pagedir1.size,
11655 +                                                pagedir2.size);
11656 +               else
11657 +                       base = pagedir1.size;
11658 +               pageflags = &pageset2_map;
11659 +       }
11660 +
11661 +       start_time = jiffies;
11662 +
11663 +       if (rw_init_modules(0, pagedir->id)) {
11664 +               toiActiveAllocator->remove_image();
11665 +               result = 1;
11666 +       } else
11667 +               result = do_rw_loop(0, finish_at, pageflags, base, barmax,
11668 +                               pagedir->id);
11669 +
11670 +       if (rw_cleanup_modules(READ) && !result) {
11671 +               abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
11672 +                               "Failed to cleanup after reading.");
11673 +               result = 1;
11674 +       }
11675 +
11676 +       /* Statistics */
11677 +       end_time = jiffies;
11678 +
11679 +       if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
11680 +               toi_bkd.toi_io_time[1][0] += finish_at;
11681 +               toi_bkd.toi_io_time[1][1] += (end_time - start_time);
11682 +       }
11683 +
11684 +       return result;
11685 +}
11686 +
11687 +/* write_module_configs()
11688 + *
11689 + * Description:        Store the configuration for each module in the image header.
11690 + * Returns:    Int: Zero on success, Error value otherwise.
11691 + */
11692 +static int write_module_configs(void)
11693 +{
11694 +       struct toi_module_ops *this_module;
11695 +       char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
11696 +       int len, index = 1;
11697 +       struct toi_module_header toi_module_header;
11698 +
11699 +       if (!buffer) {
11700 +               printk(KERN_INFO "Failed to allocate a buffer for saving "
11701 +                               "module configuration info.\n");
11702 +               return -ENOMEM;
11703 +       }
11704 +
11705 +       /*
11706 +        * We have to know which data goes with which module, so we
11707 +        * always write a length field, even if it is zero. Note that we are
11708 +        * also assuming every module's config data takes <= PAGE_SIZE.
11709 +        */
11710 +
11711 +       /* For each module (in registration order) */
11712 +       list_for_each_entry(this_module, &toi_modules, module_list) {
11713 +               if (!this_module->enabled || !this_module->storage_needed ||
11714 +                   (this_module->type == WRITER_MODULE &&
11715 +                    toiActiveAllocator != this_module))
11716 +                       continue;
11717 +
11718 +               /* Get the data from the module */
11719 +               len = 0;
11720 +               if (this_module->save_config_info)
11721 +                       len = this_module->save_config_info(buffer);
11722 +
11723 +               /* Save the details of the module */
11724 +               toi_module_header.enabled = this_module->enabled;
11725 +               toi_module_header.type = this_module->type;
11726 +               toi_module_header.index = index++;
11727 +               strncpy(toi_module_header.name, this_module->name,
11728 +                                       sizeof(toi_module_header.name));
11729 +               toiActiveAllocator->rw_header_chunk(WRITE,
11730 +                               this_module,
11731 +                               (char *) &toi_module_header,
11732 +                               sizeof(toi_module_header));
11733 +
11734 +               /* Save the size of the data and any data returned */
11735 +               toiActiveAllocator->rw_header_chunk(WRITE,
11736 +                               this_module,
11737 +                               (char *) &len, sizeof(int));
11738 +               if (len)
11739 +                       toiActiveAllocator->rw_header_chunk(
11740 +                               WRITE, this_module, buffer, len);
11741 +       }
11742 +
11743 +       /* Write a blank header to terminate the list */
11744 +       toi_module_header.name[0] = '\0';
11745 +       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
11746 +                       (char *) &toi_module_header, sizeof(toi_module_header));
11747 +
11748 +       toi_free_page(22, (unsigned long) buffer);
11749 +       return 0;
11750 +}
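+
+/*
+ * The header stream produced above is a simple sequence of length-prefixed
+ * records; a sketch of the resulting on-disk layout:
+ *
+ *   [toi_module_header][int len][len bytes of config data]   (per module)
+ *   ...
+ *   [toi_module_header with name[0] == '\0']                 (list terminator)
+ */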
11751 +
11752 +/* read_module_configs()
11753 + *
11754 + * Description:        Reload module configurations from the image header.
11755 + * Returns:    Int. Zero on success, error value otherwise.
11756 + */
11757 +
11758 +static int read_module_configs(void)
11759 +{
11760 +       struct toi_module_ops *this_module;
11761 +       char *buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
11762 +       int len, result = 0;
11763 +       struct toi_module_header toi_module_header;
11764 +
11765 +       if (!buffer) {
11766 +               printk(KERN_INFO "Failed to allocate a buffer for reloading module "
11767 +                               "configuration info.\n");
11768 +               return -ENOMEM;
11769 +       }
11770 +
11771 +       /* All modules are initially disabled. That way, if we have a module
11772 +        * loaded now that wasn't loaded when we hibernated, it won't be used
11773 +        * in trying to read the data.
11774 +        */
11775 +       list_for_each_entry(this_module, &toi_modules, module_list)
11776 +               this_module->enabled = 0;
11777 +
11778 +       /* Get the first module header */
11779 +       result = toiActiveAllocator->rw_header_chunk(READ, NULL,
11780 +                       (char *) &toi_module_header,
11781 +                       sizeof(toi_module_header));
11782 +       if (result) {
11783 +               printk("Failed to read the next module header.\n");
11784 +               toi_free_page(23, (unsigned long) buffer);
11785 +               return -EINVAL;
11786 +       }
11787 +
11788 +       /* For each module (in registration order) */
11789 +       while (toi_module_header.name[0]) {
11790 +
11791 +               /* Find the module */
11792 +               this_module =
11793 +                       toi_find_module_given_name(toi_module_header.name);
11794 +
11795 +               if (!this_module) {
11796 +                       /*
11797 +                        * Is it used? Only need to worry about filters. The
11798 +                        * active allocator must be loaded!
11799 +                        */
11800 +                       if (toi_module_header.enabled) {
11801 +                               toi_early_boot_message(1, TOI_CONTINUE_REQ,
11802 +                                       "It looks like we need module %s for "
11803 +                                       "reading the image but it hasn't been "
11804 +                                       "registered.\n",
11805 +                                       toi_module_header.name);
11806 +                               if (!(test_toi_state(TOI_CONTINUE_REQ))) {
11807 +                                       toi_free_page(23,
11808 +                                                       (unsigned long) buffer);
11809 +                                       return -EINVAL;
11810 +                               }
11811 +                       } else
11812 +                               printk(KERN_INFO "Module %s configuration data "
11813 +                                       "found, but the module hasn't "
11814 +                                       "registered. Looks like it was "
11815 +                                       "disabled, so we're ignoring its data.\n",
11816 +                                       toi_module_header.name);
11817 +               }
11818 +
11819 +               /* Get the length of the data (if any) */
11820 +               result = toiActiveAllocator->rw_header_chunk(READ, NULL,
11821 +                               (char *) &len, sizeof(int));
11822 +               if (result) {
11823 +                       printk("Failed to read the length of the module %s's"
11824 +                                       " configuration data.\n",
11825 +                                       toi_module_header.name);
11826 +                       toi_free_page(23, (unsigned long) buffer);
11827 +                       return -EINVAL;
11828 +               }
11829 +
11830 +               /* Read any data and pass to the module (if we found one) */
11831 +               if (len) {
11832 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
11833 +                                       buffer, len);
11834 +                       if (this_module) {
11835 +                               if (!this_module->load_config_info) {
11836 +                                       printk("Huh? Module %s appears to have "
11837 +                                               "a save_config_info, but not a "
11838 +                                               "load_config_info function!\n",
11839 +                                               this_module->name);
11840 +                               } else
11841 +                                       this_module->load_config_info(buffer,
11842 +                                                       len);
11843 +                       }
11844 +               }
11845 +
11846 +               if (this_module) {
11847 +                       /* Now move this module to the tail of its lists. This
11848 +                        * will put it in order. Any new modules will end up at
11849 +                        * the top of the lists. They should have been set to
11850 +                        * disabled when loaded (people will normally not edit
11851 +                        * an initrd to load a new module and then hibernate
11852 +                        * without using it!).
11853 +                        */
11854 +
11855 +                       toi_move_module_tail(this_module);
11856 +
11857 +                       /*
11858 +                        * We apply the disabled state; modules don't need to
11859 +                        * save whether they were disabled and if they do, we
11860 +                        * override them anyway.
11861 +                        */
11862 +                       this_module->enabled = toi_module_header.enabled;
11863 +               }
11864 +
11865 +               /* Get the next module header */
11866 +               result = toiActiveAllocator->rw_header_chunk(READ, NULL,
11867 +                               (char *) &toi_module_header,
11868 +                               sizeof(toi_module_header));
11869 +
11870 +               if (result) {
11871 +                       printk("Failed to read the next module header.\n");
11872 +                       toi_free_page(23, (unsigned long) buffer);
11873 +                       return -EINVAL;
11874 +               }
11875 +
11876 +       }
11877 +
11878 +       toi_free_page(23, (unsigned long) buffer);
11879 +       return 0;
11880 +}
11881 +
11882 +/* write_image_header()
11883 + *
11884 + * Description:        Write the image header after writing the image proper.
11885 + * Returns:    Int. Zero on success or -1 on failure.
11886 + */
11887 +
11888 +int write_image_header(void)
11889 +{
11890 +       int ret;
11891 +       int total = pagedir1.size + pagedir2.size + 2;
11892 +       char *header_buffer = NULL;
11893 +
11894 +       /* Now prepare to write the header */
11895 +       ret = toiActiveAllocator->write_header_init();
11896 +       if (ret) {
11897 +               abort_hibernate(TOI_FAILED_MODULE_INIT,
11898 +                               "Active allocator's write_header_init"
11899 +                               " function failed.");
11900 +               goto write_image_header_abort;
11901 +       }
11902 +
11903 +       /* Get a buffer */
11904 +       header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
11905 +       if (!header_buffer) {
11906 +               abort_hibernate(TOI_OUT_OF_MEMORY,
11907 +                       "Out of memory when trying to get page for header!");
11908 +               goto write_image_header_abort;
11909 +       }
11910 +
11911 +       /* Write hibernate header */
11912 +       if (fill_toi_header((struct toi_header *) header_buffer)) {
11913 +               abort_hibernate(TOI_OUT_OF_MEMORY,
11914 +                       "Failure to fill header information!");
11915 +               goto write_image_header_abort;
11916 +       }
11917 +       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
11918 +                       header_buffer, sizeof(struct toi_header));
11919 +
11920 +       toi_free_page(24, (unsigned long) header_buffer);
11921 +
11922 +       /* Write module configurations */
11923 +       ret = write_module_configs();
11924 +       if (ret) {
11925 +               abort_hibernate(TOI_FAILED_IO,
11926 +                               "Failed to write module configs.");
11927 +               goto write_image_header_abort;
11928 +       }
11929 +
11930 +       save_dyn_pageflags(&pageset1_map);
11931 +
11932 +       /* Flush data and let allocator cleanup */
11933 +       if (toiActiveAllocator->write_header_cleanup()) {
11934 +               abort_hibernate(TOI_FAILED_IO,
11935 +                               "Failed to cleanup writing header.");
11936 +               goto write_image_header_abort_no_cleanup;
11937 +       }
11938 +
11939 +       if (test_result_state(TOI_ABORTED))
11940 +               goto write_image_header_abort_no_cleanup;
11941 +
11942 +       toi_update_status(total, total, NULL);
11943 +
11944 +       return 0;
11945 +
11946 +write_image_header_abort:
11947 +       toiActiveAllocator->write_header_cleanup();
11948 +write_image_header_abort_no_cleanup:
11949 +       return -1;
11950 +}
11951 +
11952 +/* sanity_check()
11953 + *
11954 + * Description:        Perform a few checks, seeking to ensure that the kernel being
11955 + *             booted matches the one hibernated. They need to match so we can
11956 + *             be _sure_ things will work. It is not absolutely impossible for
11957 + *             resuming from a different kernel to work, just not assured.
11958 + * Arguments:  Struct toi_header. The header which was saved at hibernate
11959 + *             time. Returns NULL if the checks pass, else an error string.
11960 + */
11961 +static char *sanity_check(struct toi_header *sh)
11962 +{
11963 +       char *reason = check_swsusp_image_kernel((struct swsusp_info *) sh);
11964 +
11965 +       if (reason)
11966 +               return reason;
11967 +
11968 +       if (!test_action_state(TOI_IGNORE_ROOTFS)) {
11969 +               const struct super_block *sb;
11970 +               list_for_each_entry(sb, &super_blocks, s_list) {
11971 +                       if ((!(sb->s_flags & MS_RDONLY)) &&
11972 +                           (sb->s_type->fs_flags & FS_REQUIRES_DEV))
11973 +                               return "Device backed fs has been mounted "
11974 +                                       "rw prior to resume or initrd/ramfs "
11975 +                                       "is mounted rw.";
11976 +               }
11977 +       }
11978 +
11979 +       return NULL;
11980 +}
11981 +
11982 +/* __read_pageset1
11983 + *
11984 + * Description:        Test for the existence of an image and attempt to load it.
11985 + * Returns:    Int. Zero if image found and pageset1 successfully loaded.
11986 + *             Error if no image found or loaded.
11987 + */
11988 +static int __read_pageset1(void)
11989 +{
11990 +       int i, result = 0;
11991 +       char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
11992 +            *sanity_error = NULL;
11993 +       struct toi_header *toi_header;
11994 +
11995 +       if (!header_buffer) {
11996 +               printk(KERN_INFO "Unable to allocate a page for reading the "
11997 +                               "signature.\n");
11998 +               return -ENOMEM;
11999 +       }
12000 +
12001 +       /* Check for an image */
12002 +       result = toiActiveAllocator->image_exists(1);
12003 +       if (!result) {
12004 +               result = -ENODATA;
12005 +               noresume_reset_modules();
12006 +               printk(KERN_INFO "TuxOnIce: No image found.\n");
12007 +               goto out;
12008 +       }
12009 +
12010 +       /*
12011 +        * Prepare the active allocator for reading the image header. The
12012 +        * active allocator might read its own configuration.
12013 +        *
12014 +        * NB: This call may never return: there might be a signature for
12015 +        * a different image, in which case we warn the user and they may
12016 +        * choose to reboot (e.g. if the device ids look erroneous (2.4 vs
12017 +        * 2.6), or if the image was stored over a network connection whose
12018 +        * location is now unavailable).
12019 +        */
12020 +
12021 +       result = toiActiveAllocator->read_header_init();
12022 +       if (result) {
12023 +               printk(KERN_INFO "TuxOnIce: Failed to initialise reading "
12024 +                               "of the image header.\n");
12025 +               goto out_remove_image;
12026 +       }
12027 +
12028 +       /* Check for noresume command line option */
12029 +       if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
12030 +               printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
12031 +                               "image.\n");
12032 +               goto out_remove_image;
12033 +       }
12034 +
12035 +       /* Check whether we've resumed before */
12036 +       if (test_toi_state(TOI_RESUMED_BEFORE)) {
12037 +               toi_early_boot_message(1, 0, NULL);
12038 +               if (!(test_toi_state(TOI_CONTINUE_REQ))) {
12039 +                       printk(KERN_INFO "TuxOnIce: Tried to resume before: "
12040 +                                       "Invalidated image.\n");
12041 +                       goto out_remove_image;
12042 +               }
12043 +       }
12044 +
12045 +       clear_toi_state(TOI_CONTINUE_REQ);
12046 +
12047 +       /* Read hibernate header */
12048 +       result = toiActiveAllocator->rw_header_chunk(READ, NULL,
12049 +                       header_buffer, sizeof(struct toi_header));
12050 +       if (result < 0) {
12051 +               printk("TuxOnIce: Failed to read the image header.\n");
12052 +               goto out_remove_image;
12053 +       }
12054 +
12055 +       toi_header = (struct toi_header *) header_buffer;
12056 +
12057 +       /*
12058 +        * NB: This call may also result in a reboot rather than returning.
12059 +        */
12060 +
12061 +       sanity_error = sanity_check(toi_header);
12062 +       if (sanity_error) {
12063 +               toi_early_boot_message(1, TOI_CONTINUE_REQ,
12064 +                               sanity_error);
12065 +               printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
12066 +               goto out_remove_image;
12067 +       }
12068 +
12069 +       /*
12070 +        * We have an image and it looks like it will load okay.
12071 +        *
12072 +        * Get metadata from header. Don't override commandline parameters.
12073 +        *
12074 +        * We don't need to save the image size limit because it's not used
12075 +        * during resume and will be restored with the image anyway.
12076 +        */
12077 +
12078 +       memcpy((char *) &pagedir1,
12079 +               (char *) &toi_header->pagedir, sizeof(pagedir1));
12080 +       toi_result = toi_header->param0;
12081 +       toi_bkd.toi_action = toi_header->param1;
12082 +       toi_bkd.toi_debug_state = toi_header->param2;
12083 +       toi_bkd.toi_default_console_level = toi_header->param3;
12084 +       clear_toi_state(TOI_IGNORE_LOGLEVEL);
12085 +       pagedir2.size = toi_header->pageset_2_size;
12086 +       for (i = 0; i < 4; i++)
12087 +               toi_bkd.toi_io_time[i/2][i%2] =
12088 +                       toi_header->io_time[i/2][i%2];
12089 +       boot_kernel_data_buffer = toi_header->bkd;
12090 +
12091 +       /* Read module configurations */
12092 +       result = read_module_configs();
12093 +       if (result) {
12094 +               pagedir1.size = pagedir2.size = 0;
12095 +               printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
12096 +                               "configurations.\n");
12097 +               clear_action_state(TOI_KEEP_IMAGE);
12098 +               goto out_remove_image;
12099 +       }
12100 +
12101 +       toi_prepare_console();
12102 +
12103 +       set_toi_state(TOI_NOW_RESUMING);
12104 +
12105 +       if (pre_resume_freeze())
12106 +               goto out_reset_console;
12107 +
12108 +       toi_cond_pause(1, "About to read original pageset1 locations.");
12109 +
12110 +       /*
12111 +        * Read original pageset1 locations. These are the addresses we can't
12112 +        * use for the data to be restored.
12113 +        */
12114 +
12115 +       if (allocate_dyn_pageflags(&pageset1_map, 0) ||
12116 +           allocate_dyn_pageflags(&pageset1_copy_map, 0) ||
12117 +           allocate_dyn_pageflags(&io_map, 0))
12118 +               goto out_reset_console;
12119 +
12120 +       if (load_dyn_pageflags(&pageset1_map))
12121 +               goto out_reset_console;
12122 +
12123 +       /* Clean up after reading the header */
12124 +       result = toiActiveAllocator->read_header_cleanup();
12125 +       if (result) {
12126 +               printk("TuxOnIce: Failed to cleanup after reading the image "
12127 +                               "header.\n");
12128 +               goto out_reset_console;
12129 +       }
12130 +
12131 +       toi_cond_pause(1, "About to read pagedir.");
12132 +
12133 +       /*
12134 +        * Get the addresses of the pages into which we will load the
12135 +        * kernel that is to be copied back.
12136 +        */
12137 +       if (toi_get_pageset1_load_addresses()) {
12138 +               printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
12139 +                               "pageset1.\n");
12140 +               goto out_reset_console;
12141 +       }
12142 +
12143 +       /* Read the original kernel back */
12144 +       toi_cond_pause(1, "About to read pageset 1.");
12145 +
12146 +       if (read_pageset(&pagedir1, 0)) {
12147 +               toi_prepare_status(CLEAR_BAR, "Failed to read pageset 1.");
12148 +               result = -EIO;
12149 +               printk(KERN_INFO "TuxOnIce: Failed to load pageset1.\n");
12150 +               goto out_reset_console;
12151 +       }
12152 +
12153 +       toi_cond_pause(1, "About to restore original kernel.");
12154 +       result = 0;
12155 +
12156 +       if (!test_action_state(TOI_KEEP_IMAGE) &&
12157 +           toiActiveAllocator->mark_resume_attempted)
12158 +               toiActiveAllocator->mark_resume_attempted(1);
12159 +
12160 +out:
12161 +       toi_free_page(25, (unsigned long) header_buffer);
12162 +       return result;
12163 +
12164 +out_reset_console:
12165 +       toi_cleanup_console();
12166 +
12167 +out_remove_image:
12168 +       free_dyn_pageflags(&pageset1_map);
12169 +       free_dyn_pageflags(&pageset1_copy_map);
12170 +       free_dyn_pageflags(&io_map);
12171 +       result = -EINVAL;
12172 +       if (!test_action_state(TOI_KEEP_IMAGE))
12173 +               toiActiveAllocator->remove_image();
12174 +       toiActiveAllocator->read_header_cleanup();
12175 +       noresume_reset_modules();
12176 +       goto out;
12177 +}
12178 +
12179 +/* read_pageset1()
12180 + *
12181 + * Description:        Attempt to read the header and pageset1 of a hibernate image.
12182 + *             Handle the outcome, complaining where appropriate.
12183 + */
12184 +
12185 +int read_pageset1(void)
12186 +{
12187 +       int error;
12188 +
12189 +       error = __read_pageset1();
12190 +
12191 +       if (error && error != -ENODATA && error != -EINVAL &&
12192 +                                       !test_result_state(TOI_ABORTED))
12193 +               abort_hibernate(TOI_IMAGE_ERROR,
12194 +                       "TuxOnIce: Error %d resuming\n", error);
12195 +
12196 +       return error;
12197 +}
12198 +
12199 +/*
12200 + * get_have_image_data()
12201 + */
12202 +static char *get_have_image_data(void)
12203 +{
12204 +       char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
12205 +       struct toi_header *toi_header;
12206 +
12207 +       if (!output_buffer) {
12208 +               printk(KERN_INFO "Failed to allocate an output buffer.\n");
12209 +               return NULL;
12210 +       }
12211 +
12212 +       /* Check for an image */
12213 +       if (!toiActiveAllocator->image_exists(1) ||
12214 +           toiActiveAllocator->read_header_init() ||
12215 +           toiActiveAllocator->rw_header_chunk(READ, NULL,
12216 +                       output_buffer, sizeof(struct toi_header))) {
12217 +               sprintf(output_buffer, "0\n");
12218 +               /*
12219 +                * From an initrd/ramfs, catting have_image and
12220 +                * getting a result of 0 is sufficient.
12221 +                */
12222 +               clear_toi_state(TOI_BOOT_TIME);
12223 +               goto out;
12224 +       }
12225 +
12226 +       toi_header = (struct toi_header *) output_buffer;
12227 +
12228 +       sprintf(output_buffer, "1\n%s\n%s\n",
12229 +                       toi_header->uts.machine,
12230 +                       toi_header->uts.version);
12231 +
12232 +       /* Check whether we've resumed before */
12233 +       if (test_toi_state(TOI_RESUMED_BEFORE))
12234 +               strcat(output_buffer, "Resumed before.\n");
12235 +
12236 +out:
12237 +       noresume_reset_modules();
12238 +       return output_buffer;
12239 +}
12240 +
12241 +/* read_pageset2()
12242 + *
12243 + * Description:        Read in part or all of pageset2 of an image, depending upon
12244 + *             whether we are hibernating and have only overwritten a portion
12245 + *             with pageset1 pages, or are resuming and need to read them
12246 + *             all.
12247 + * Arguments:  Int. Boolean. Read only pages which would have been
12248 + *             overwritten by pageset1?
12249 + * Returns:    Int. Zero if no error, otherwise the error value.
12250 + */
12251 +int read_pageset2(int overwrittenpagesonly)
12252 +{
12253 +       int result = 0;
12254 +
12255 +       if (!pagedir2.size)
12256 +               return 0;
12257 +
12258 +       result = read_pageset(&pagedir2, overwrittenpagesonly);
12259 +
12260 +       toi_update_status(100, 100, NULL);
12261 +       toi_cond_pause(1, "Pagedir 2 read.");
12262 +
12263 +       return result;
12264 +}
12265 +
12266 +/* image_exists_read
12267 + *
12268 + * Return 0 or 1, depending on whether an image is found.
12269 + * Incoming buffer is PAGE_SIZE and result is guaranteed
12270 + * to be far less than that, so we don't worry about
12271 + * overflow.
12272 + */
12273 +int image_exists_read(const char *page, int count)
12274 +{
12275 +       int len = 0;
12276 +       char *result;
12277 +
12278 +       if (toi_activate_storage(0))
12279 +               return count;
12280 +
12281 +       if (!test_toi_state(TOI_RESUME_DEVICE_OK))
12282 +               toi_attempt_to_parse_resume_device(0);
12283 +
12284 +       if (!toiActiveAllocator) {
12285 +               len = sprintf((char *) page, "-1\n");
12286 +       } else {
12287 +               result = get_have_image_data();
12288 +               if (result) {
12289 +                       len = sprintf((char *) page, "%s", result);
12290 +                       toi_free_page(26, (unsigned long) result);
12291 +               }
12292 +       }
12293 +
12294 +       toi_deactivate_storage(0);
12295 +
12296 +       return len;
12297 +}
12298 +
12299 +/* image_exists_write
12300 + *
12301 + * Invalidate an image if one exists.
12302 + */
12303 +int image_exists_write(const char *buffer, int count)
12304 +{
12305 +       if (toi_activate_storage(0))
12306 +               return count;
12307 +
12308 +       if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
12309 +               toiActiveAllocator->remove_image();
12310 +
12311 +       toi_deactivate_storage(0);
12312 +
12313 +       clear_result_state(TOI_KEPT_IMAGE);
12314 +
12315 +       return count;
12316 +}
12317 +
12318 +#ifdef CONFIG_TOI_EXPORTS
12319 +EXPORT_SYMBOL_GPL(toi_attempt_to_parse_resume_device);
12320 +EXPORT_SYMBOL_GPL(attempt_to_parse_resume_device2);
12321 +EXPORT_SYMBOL_GPL(toi_io_workers);
12322 +EXPORT_SYMBOL_GPL(toi_io_queue_flusher);
12323 +EXPORT_SYMBOL_GPL(toi_bio_queue_flusher_should_finish);
12324 +#endif
12325 +
12326 diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
12327 new file mode 100644
12328 index 0000000..d4b470b
12329 --- /dev/null
12330 +++ b/kernel/power/tuxonice_io.h
12331 @@ -0,0 +1,71 @@
12332 +/*
12333 + * kernel/power/tuxonice_io.h
12334 + *
12335 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
12336 + *
12337 + * This file is released under the GPLv2.
12338 + *
12339 + * It contains high level IO routines for hibernating.
12340 + *
12341 + */
12342 +
12343 +#include <linux/utsname.h>
12344 +#include "tuxonice_pagedir.h"
12345 +#include "power.h"
12346 +
12347 +/* Non-module data saved in our image header */
12348 +struct toi_header {
12349 +       /*
12350 +        * Mirror struct swsusp_info, but without
12351 +        * the page aligned attribute
12352 +        */
12353 +       struct new_utsname uts;
12354 +       u32 version_code;
12355 +       unsigned long num_physpages;
12356 +       int cpus;
12357 +       unsigned long image_pages;
12358 +       unsigned long pages;
12359 +       unsigned long size;
12360 +
12361 +       /* Our own data */
12362 +       unsigned long orig_mem_free;
12363 +       int page_size;
12364 +       int pageset_2_size;
12365 +       int param0;
12366 +       int param1;
12367 +       int param2;
12368 +       int param3;
12369 +       int progress0;
12370 +       int progress1;
12371 +       int progress2;
12372 +       int progress3;
12373 +       int io_time[2][2];
12374 +       struct pagedir pagedir;
12375 +       dev_t root_fs;
12376 +       unsigned long bkd; /* Boot kernel data location */
12377 +};
12378 +
12379 +extern int write_pageset(struct pagedir *pagedir);
12380 +extern int write_image_header(void);
12381 +extern int read_pageset1(void);
12382 +extern int read_pageset2(int overwrittenpagesonly);
12383 +
12384 +extern int toi_attempt_to_parse_resume_device(int quiet);
12385 +extern void attempt_to_parse_resume_device2(void);
12386 +extern void attempt_to_parse_alt_resume_param(void);
12387 +int image_exists_read(const char *page, int count);
12388 +int image_exists_write(const char *buffer, int count);
12389 +extern void save_restore_alt_param(int replace, int quiet);
12390 +extern atomic_t toi_io_workers;
12391 +
12392 +/* Args to save_restore_alt_param */
12393 +#define RESTORE 0
12394 +#define SAVE 1
12395 +
12396 +#define NOQUIET 0
12397 +#define QUIET 1
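+
+/*
+ * Illustrative only (semantics inferred from the names above): a caller
+ * would save the alternate resume parameter verbosely with
+ *
+ *   save_restore_alt_param(SAVE, NOQUIET);
+ *
+ * and later put it back with save_restore_alt_param(RESTORE, QUIET);
+ */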
12398 +
12399 +extern dev_t name_to_dev_t(char *line);
12400 +
12401 +extern wait_queue_head_t toi_io_queue_flusher;
12402 +extern int toi_bio_queue_flusher_should_finish;
12403 diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
12404 new file mode 100644
12405 index 0000000..dcc7045
12406 --- /dev/null
12407 +++ b/kernel/power/tuxonice_modules.c
12408 @@ -0,0 +1,465 @@
12409 +/*
12410 + * kernel/power/tuxonice_modules.c
12411 + *
12412 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
12413 + *
12414 + */
12415 +
12416 +#include <linux/suspend.h>
12417 +#include <linux/module.h>
12418 +#include "tuxonice.h"
12419 +#include "tuxonice_modules.h"
12420 +#include "tuxonice_sysfs.h"
12421 +#include "tuxonice_ui.h"
12422 +
12423 +LIST_HEAD(toi_filters);
12424 +LIST_HEAD(toiAllocators);
12425 +LIST_HEAD(toi_modules);
12426 +
12427 +struct toi_module_ops *toiActiveAllocator;
12428 +int toi_num_filters;
12429 +int toiNumAllocators, toi_num_modules;
12430 +
12431 +/*
12432 + * toi_header_storage_for_modules
12433 + *
12434 + * Returns the amount of space needed to store the configuration
12435 + * data the modules require prior to copying back the original
12436 + * kernel. We can exclude data for pageset2 because it will be
12437 + * available anyway once the kernel is copied back.
12438 + */
12439 +long toi_header_storage_for_modules(void)
12440 +{
12441 +       struct toi_module_ops *this_module;
12442 +       int bytes = 0;
12443 +
12444 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12445 +               if (!this_module->enabled ||
12446 +                   (this_module->type == WRITER_MODULE &&
12447 +                    toiActiveAllocator != this_module))
12448 +                       continue;
12449 +               if (this_module->storage_needed) {
12450 +                       int this = this_module->storage_needed() +
12451 +                               sizeof(struct toi_module_header) +
12452 +                               sizeof(int);
12453 +                       this_module->header_requested = this;
12454 +                       bytes += this;
12455 +               }
12456 +       }
12457 +
12458 +       /* One more for the empty terminator */
12459 +       return bytes + sizeof(struct toi_module_header);
12460 +}
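+
+/*
+ * A worked example of the calculation above (module sizes invented for
+ * illustration): with two enabled modules whose storage_needed() returns
+ * 100 and 200, and writing H = sizeof(struct toi_module_header) and
+ * L = sizeof(int), the total is (100 + H + L) + (200 + H + L) + H,
+ * the final H being the empty terminator record.
+ */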
12461 +
12462 +/*
12463 + * toi_memory_for_modules
12464 + *
12465 + * Returns the number of pages requested by modules for
12466 + * doing their work during the cycle.
12467 + */
12468 +
12469 +long toi_memory_for_modules(int print_parts)
12470 +{
12471 +       long bytes = 0, result;
12472 +       struct toi_module_ops *this_module;
12473 +
12474 +       if (print_parts)
12475 +               printk(KERN_INFO "Memory for modules:\n===================\n");
12476 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12477 +               int this;
12478 +               if (!this_module->enabled)
12479 +                       continue;
12480 +               if (this_module->memory_needed) {
12481 +                       this = this_module->memory_needed();
12482 +                       if (print_parts)
12483 +                               printk(KERN_INFO "%10d bytes (%5ld pages) for "
12484 +                                               "module '%s'.\n", this,
12485 +                                               DIV_ROUND_UP(this, PAGE_SIZE),
12486 +                                               this_module->name);
12487 +                       bytes += this;
12488 +               }
12489 +       }
12490 +
12491 +       result = DIV_ROUND_UP(bytes, PAGE_SIZE);
12492 +       if (print_parts)
12493 +               printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
12494 +
12495 +       return result;
12496 +}
12497 +
12498 +/*
12499 + * toi_expected_compression_ratio
12500 + *
12501 + * Returns the compression ratio expected when saving the image.
12502 + */
12503 +
12504 +int toi_expected_compression_ratio(void)
12505 +{
12506 +       int ratio = 100;
12507 +       struct toi_module_ops *this_module;
12508 +
12509 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12510 +               if (!this_module->enabled)
12511 +                       continue;
12512 +               if (this_module->expected_compression)
12513 +                       ratio = ratio * this_module->expected_compression()
12514 +                               / 100;
12515 +       }
12516 +
12517 +       return ratio;
12518 +}
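+
+/*
+ * Expected ratios compose multiplicatively. For example (figures invented
+ * for illustration): two enabled modules each expecting 50% give
+ * 100 * 50 / 100 = 50, then 50 * 50 / 100 = 25, i.e. 25% overall.
+ */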
12519 +
12520 +/* toi_find_module_given_dir
12521 + * Functionality :     Return a module (if found), given a pointer
12522 + *                     to its directory name
12523 + */
12524 +
12525 +static struct toi_module_ops *toi_find_module_given_dir(char *name)
12526 +{
12527 +       struct toi_module_ops *this_module, *found_module = NULL;
12528 +
12529 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12530 +               if (!strcmp(name, this_module->directory)) {
12531 +                       found_module = this_module;
12532 +                       break;
12533 +               }
12534 +       }
12535 +
12536 +       return found_module;
12537 +}
12538 +
12539 +/* toi_find_module_given_name
12540 + * Functionality :     Return a module (if found), given a pointer
12541 + *                     to its name
12542 + */
12543 +
12544 +struct toi_module_ops *toi_find_module_given_name(char *name)
12545 +{
12546 +       struct toi_module_ops *this_module, *found_module = NULL;
12547 +
12548 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12549 +               if (!strcmp(name, this_module->name)) {
12550 +                       found_module = this_module;
12551 +                       break;
12552 +               }
12553 +       }
12554 +
12555 +       return found_module;
12556 +}
12557 +
12558 +/*
12559 + * toi_print_module_debug_info
12560 + * Functionality   : Get debugging info from modules into a buffer.
12561 + */
12562 +int toi_print_module_debug_info(char *buffer, int buffer_size)
12563 +{
12564 +       struct toi_module_ops *this_module;
12565 +       int len = 0;
12566 +
12567 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12568 +               if (!this_module->enabled)
12569 +                       continue;
12570 +               if (this_module->print_debug_info) {
12571 +                       int result;
12572 +                       result = this_module->print_debug_info(buffer + len,
12573 +                                       buffer_size - len);
12574 +                       len += result;
12575 +               }
12576 +       }
12577 +
12578 +       /* Ensure null terminated */
12579 +       buffer[buffer_size - 1] = '\0';
12580 +
12581 +       return len;
12582 +}
12583 +
12584 +/*
12585 + * toi_register_module
12586 + *
12587 + * Register a module.
12588 + */
12589 +int toi_register_module(struct toi_module_ops *module)
12590 +{
12591 +       int i;
12592 +       struct kobject *kobj;
12593 +
12594 +       module->enabled = 1;
12595 +
12596 +       if (toi_find_module_given_name(module->name)) {
12597 +               printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
12598 +                               " which is already registered.\n",
12599 +                               module->name);
12600 +               return -EBUSY;
12601 +       }
12602 +
12603 +       switch (module->type) {
12604 +       case FILTER_MODULE:
12605 +               list_add_tail(&module->type_list, &toi_filters);
12606 +               toi_num_filters++;
12607 +               break;
12608 +       case WRITER_MODULE:
12609 +               list_add_tail(&module->type_list, &toiAllocators);
12610 +               toiNumAllocators++;
12611 +               break;
12612 +       case MISC_MODULE:
12613 +       case MISC_HIDDEN_MODULE:
12614 +               break;
12615 +       default:
12616 +               printk("Hmmm. Module '%s' has an invalid type."
12617 +                       " It has been ignored.\n", module->name);
12618 +               return -EINVAL;
12619 +       }
12620 +       list_add_tail(&module->module_list, &toi_modules);
12621 +       toi_num_modules++;
12622 +
12623 +       if (!module->directory && !module->shared_directory)
12624 +               return 0;
12625 +
12626 +       /*
12627 +        * Modules may share a directory, but those with shared_directory
12628 +        * set must be loaded (via symbol dependencies) after parents
12629 +        * and unloaded beforehand.
12630 +        */
12631 +       if (module->shared_directory) {
12632 +               struct toi_module_ops *shared =
12633 +                       toi_find_module_given_dir(module->shared_directory);
12634 +               if (!shared) {
12635 +                       printk("TuxOnIce: Module %s wants to share %s's "
12636 +                                       "directory but %s isn't loaded.\n",
12637 +                                       module->name, module->shared_directory,
12638 +                                       module->shared_directory);
12639 +                       toi_unregister_module(module);
12640 +                       return -ENODEV;
12641 +               }
12642 +               kobj = shared->dir_kobj;
12643 +       } else {
12644 +               if (!strncmp(module->directory, "[ROOT]", 6))
12645 +                       kobj = tuxonice_kobj;
12646 +               else
12647 +                       kobj = make_toi_sysdir(module->directory);
12648 +       }
12649 +       module->dir_kobj = kobj;
12650 +       for (i = 0; i < module->num_sysfs_entries; i++) {
12651 +               int result = toi_register_sysfs_file(kobj,
12652 +                               &module->sysfs_data[i]);
12653 +               if (result)
12654 +                       return result;
12655 +       }
12656 +       return 0;
12657 +}
12658 +
12659 +/*
12660 + * toi_unregister_module
12661 + *
12662 + * Remove a module.
12663 + */
12664 +void toi_unregister_module(struct toi_module_ops *module)
12665 +{
12666 +       int i;
12667 +
12668 +       if (module->dir_kobj)
12669 +               for (i = 0; i < module->num_sysfs_entries; i++)
12670 +                       toi_unregister_sysfs_file(module->dir_kobj,
12671 +                                       &module->sysfs_data[i]);
12672 +
12673 +       if (!module->shared_directory && module->directory &&
12674 +                       strncmp(module->directory, "[ROOT]", 6))
12675 +               remove_toi_sysdir(module->dir_kobj);
12676 +
12677 +       switch (module->type) {
12678 +       case FILTER_MODULE:
12679 +               list_del(&module->type_list);
12680 +               toi_num_filters--;
12681 +               break;
12682 +       case WRITER_MODULE:
12683 +               list_del(&module->type_list);
12684 +               toiNumAllocators--;
12685 +               if (toiActiveAllocator == module) {
12686 +                       toiActiveAllocator = NULL;
12687 +                       clear_toi_state(TOI_CAN_RESUME);
12688 +                       clear_toi_state(TOI_CAN_HIBERNATE);
12689 +               }
12690 +               break;
12691 +       case MISC_MODULE:
12692 +       case MISC_HIDDEN_MODULE:
12693 +               break;
12694 +       default:
12695 +               printk("Hmmm. Module '%s' has an invalid type."
12696 +                       " It has been ignored.\n", module->name);
12697 +               return;
12698 +       }
12699 +       list_del(&module->module_list);
12700 +       toi_num_modules--;
12701 +}
12702 +
12703 +/*
12704 + * toi_move_module_tail
12705 + *
12706 + * Rearrange modules when reloading the config.
12707 + */
12708 +void toi_move_module_tail(struct toi_module_ops *module)
12709 +{
12710 +       switch (module->type) {
12711 +       case FILTER_MODULE:
12712 +               if (toi_num_filters > 1)
12713 +                       list_move_tail(&module->type_list, &toi_filters);
12714 +               break;
12715 +       case WRITER_MODULE:
12716 +               if (toiNumAllocators > 1)
12717 +                       list_move_tail(&module->type_list, &toiAllocators);
12718 +               break;
12719 +       case MISC_MODULE:
12720 +       case MISC_HIDDEN_MODULE:
12721 +               break;
12722 +       default:
12723 +               printk("Hmmm. Module '%s' has an invalid type."
12724 +                       " It has been ignored.\n", module->name);
12725 +               return;
12726 +       }
12727 +       if ((toi_num_filters + toiNumAllocators) > 1)
12728 +               list_move_tail(&module->module_list, &toi_modules);
12729 +}
12730 +
12731 +/*
12732 + * toi_initialise_modules
12733 + *
12734 + * Get ready to do some work!
12735 + */
12736 +int toi_initialise_modules(int starting_cycle, int early)
12737 +{
12738 +       struct toi_module_ops *this_module;
12739 +       int result;
12740 +
12741 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12742 +               this_module->header_requested = 0;
12743 +               this_module->header_used = 0;
12744 +               if (!this_module->enabled)
12745 +                       continue;
12746 +               if (this_module->early != early)
12747 +                       continue;
12748 +               if (this_module->initialise) {
12749 +                       toi_message(TOI_MEMORY, TOI_MEDIUM, 1,
12750 +                               "Initialising module %s.\n",
12751 +                               this_module->name);
12752 +                       result = this_module->initialise(starting_cycle);
12753 +                       if (result) {
12754 +                               toi_cleanup_modules(starting_cycle);
12755 +                               return result;
12756 +                       }
12757 +                       this_module->initialised = 1;
12758 +               }
12759 +       }
12760 +
12761 +       return 0;
12762 +}
12763 +
12764 +/*
12765 + * toi_cleanup_modules
12766 + *
12767 + * Tell modules the work is done.
12768 + */
12769 +void toi_cleanup_modules(int finishing_cycle)
12770 +{
12771 +       struct toi_module_ops *this_module;
12772 +
12773 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12774 +               if (!this_module->enabled || !this_module->initialised)
12775 +                       continue;
12776 +               if (this_module->cleanup) {
12777 +                       toi_message(TOI_MEMORY, TOI_MEDIUM, 1,
12778 +                               "Cleaning up module %s.\n",
12779 +                               this_module->name);
12780 +                       this_module->cleanup(finishing_cycle);
12781 +               }
12782 +               this_module->initialised = 0;
12783 +       }
12784 +}
12785 +
12786 +/*
12787 + * toi_get_next_filter
12788 + *
12789 + * Get the next filter in the pipeline.
12790 + */
12791 +struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
12792 +{
12793 +       struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
12794 +
12795 +       list_for_each_entry(this_filter, &toi_filters, type_list) {
12796 +               if (!this_filter->enabled)
12797 +                       continue;
12798 +               if ((last_filter == filter_sought) || (!filter_sought))
12799 +                       return this_filter;
12800 +               last_filter = this_filter;
12801 +       }
12802 +
12803 +       return toiActiveAllocator;
12804 +}
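+
+/*
+ * A sketch of walking the pipeline with the helper above (usage invented
+ * for illustration): NULL yields the first enabled filter, passing a
+ * filter yields the next one, and after the last enabled filter the
+ * active allocator is returned:
+ *
+ *   struct toi_module_ops *m = toi_get_next_filter(NULL);
+ *   while (m && m != toiActiveAllocator)
+ *           m = toi_get_next_filter(m);
+ */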
12805 +
12806 +/**
12807 + * toi_print_modules: Printk what support is loaded.
12808 + */
12809 +void toi_print_modules(void)
12810 +{
12811 +       struct toi_module_ops *this_module;
12812 +       int prev = 0;
12813 +
12814 +       printk("TuxOnIce " TOI_CORE_VERSION ", with support for");
12815 +
12816 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12817 +               if (this_module->type == MISC_HIDDEN_MODULE)
12818 +                       continue;
12819 +               printk("%s %s%s%s", prev ? "," : "",
12820 +                               this_module->enabled ? "" : "[",
12821 +                               this_module->name,
12822 +                               this_module->enabled ? "" : "]");
12823 +               prev = 1;
12824 +       }
12825 +
12826 +       printk(".\n");
12827 +}
12828 +
12829 +/* toi_get_modules
12830 + *
12831 + * Take a reference to modules so they can't go away under us.
12832 + */
12833 +
12834 +int toi_get_modules(void)
12835 +{
12836 +       struct toi_module_ops *this_module;
12837 +
12838 +       list_for_each_entry(this_module, &toi_modules, module_list) {
12839 +               struct toi_module_ops *this_module2;
12840 +
12841 +               if (try_module_get(this_module->module))
12842 +                       continue;
12843 +
12844 +               /* Failed! Reverse gets and return error */
12845 +               list_for_each_entry(this_module2, &toi_modules,
12846 +                               module_list) {
12847 +                       if (this_module == this_module2)
12848 +                               return -EINVAL;
12849 +                       module_put(this_module2->module);
12850 +               }
12851 +       }
12852 +       return 0;
12853 +}
12854 +
12855 +/* toi_put_modules
12856 + *
12857 + * Release our references to modules we used.
12858 + */
12859 +
12860 +void toi_put_modules(void)
12861 +{
12862 +       struct toi_module_ops *this_module;
12863 +
12864 +       list_for_each_entry(this_module, &toi_modules, module_list)
12865 +               module_put(this_module->module);
12866 +}
12867 +
12868 +#ifdef CONFIG_TOI_EXPORTS
12869 +EXPORT_SYMBOL_GPL(toi_register_module);
12870 +EXPORT_SYMBOL_GPL(toi_unregister_module);
12871 +EXPORT_SYMBOL_GPL(toi_get_next_filter);
12872 +EXPORT_SYMBOL_GPL(toiActiveAllocator);
12873 +#endif
12874 diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
12875 new file mode 100644
12876 index 0000000..6d9ea6b
12877 --- /dev/null
12878 +++ b/kernel/power/tuxonice_modules.h
12879 @@ -0,0 +1,176 @@
12880 +/*
12881 + * kernel/power/tuxonice_modules.h
12882 + *
12883 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
12884 + *
12885 + * This file is released under the GPLv2.
12886 + *
12887 + * It contains declarations for modules. Modules are additions to
12888 + * TuxOnIce that provide facilities such as image compression or
12889 + * encryption, backends for storage of the image and user interfaces.
12890 + *
12891 + */
12892 +
12893 +#ifndef TOI_MODULES_H
12894 +#define TOI_MODULES_H
12895 +
12896 +/* This is the maximum size we store in the image header for a module name */
12897 +#define TOI_MAX_MODULE_NAME_LENGTH 30
12898 +
12899 +/* Per-module metadata */
12900 +struct toi_module_header {
12901 +       char name[TOI_MAX_MODULE_NAME_LENGTH];
12902 +       int enabled;
12903 +       int type;
12904 +       int index;
12905 +       int data_length;
12906 +       unsigned long signature;
12907 +};
12908 +
12909 +enum {
12910 +       FILTER_MODULE,
12911 +       WRITER_MODULE,
12912 +       MISC_MODULE, /* Block writer, e.g. */
12913 +       MISC_HIDDEN_MODULE,
12914 +};
12915 +
12916 +enum {
12917 +       TOI_ASYNC,
12918 +       TOI_SYNC
12919 +};
12920 +
12921 +struct toi_module_ops {
12922 +       /* Functions common to all modules */
12923 +       int type;
12924 +       char *name;
12925 +       char *directory;
12926 +       char *shared_directory;
12927 +       struct kobject *dir_kobj;
12928 +       struct module *module;
12929 +       int enabled, early, initialised;
12930 +       struct list_head module_list;
12931 +
12932 +       /* List of filters or allocators */
12933 +       struct list_head list, type_list;
12934 +
12935 +       /*
12936 +        * Requirements for memory and storage in
12937 +        * the image header.
12938 +        */
12939 +       int (*memory_needed) (void);
12940 +       int (*storage_needed) (void);
12941 +
12942 +       int header_requested, header_used;
12943 +
12944 +       int (*expected_compression) (void);
12945 +
12946 +       /*
12947 +        * Debug info
12948 +        */
12949 +       int (*print_debug_info) (char *buffer, int size);
12950 +       int (*save_config_info) (char *buffer);
12951 +       void (*load_config_info) (char *buffer, int len);
12952 +
12953 +       /*
12954 +        * Initialise & cleanup - general routines called
12955 +        * at the start and end of a cycle.
12956 +        */
12957 +       int (*initialise) (int starting_cycle);
12958 +       void (*cleanup) (int finishing_cycle);
12959 +
12960 +       /*
12961 +        * Calls for allocating storage (allocators only).
12962 +        *
12963 +        * Header space is allocated separately. Note that allocation
12964 +        * of space for the header might result in allocated space
12965 +        * being stolen from the main pool if there is no unallocated
12966 +        * space. We have to be able to allocate enough space for
12967 +        * the header. We can eat memory to ensure there is enough
12968 +        * for the main pool.
12969 +        */
12970 +
12971 +       int (*storage_available) (void);
12972 +       void (*reserve_header_space) (int space_requested);
12973 +       int (*allocate_storage) (int space_requested);
12974 +       int (*storage_allocated) (void);
12975 +       int (*release_storage) (void);
12976 +
12977 +       /*
12978 +        * Routines used in image I/O.
12979 +        */
12980 +       int (*rw_init) (int rw, int stream_number);
12981 +       int (*rw_cleanup) (int rw);
12982 +       int (*write_page) (unsigned long index, struct page *buffer_page,
12983 +                       unsigned int buf_size);
12984 +       int (*read_page) (unsigned long *index, struct page *buffer_page,
12985 +                       unsigned int *buf_size);
12986 +       void (*io_flusher) (int rw);
12987 +
12988 +       /* Reset module if image exists but reading aborted */
12989 +       void (*noresume_reset) (void);
12990 +
12991 +       /* Read and write the metadata */
12992 +       int (*write_header_init) (void);
12993 +       int (*write_header_cleanup) (void);
12994 +
12995 +       int (*read_header_init) (void);
12996 +       int (*read_header_cleanup) (void);
12997 +
12998 +       int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
12999 +                       char *buffer_start, int buffer_size);
13000 +
13001 +       int (*rw_header_chunk_noreadahead) (int rw,
13002 +                       struct toi_module_ops *owner, char *buffer_start,
13003 +                       int buffer_size);
13004 +
13005 +       /* Attempt to parse an image location */
13006 +       int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
13007 +
13008 +       /* Determine whether image exists that we can restore */
13009 +       int (*image_exists) (int quiet);
13010 +
13011 +       /* Mark the image as having tried to resume */
13012 +       int (*mark_resume_attempted) (int);
13013 +
13014 +       /* Destroy image if one exists */
13015 +       int (*remove_image) (void);
13016 +
13017 +       /* Sysfs Data */
13018 +       struct toi_sysfs_data *sysfs_data;
13019 +       int num_sysfs_entries;
13020 +};
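+
+/*
+ * A minimal sketch of registering a module with this interface (all names
+ * below are invented for illustration):
+ *
+ *   static struct toi_module_ops demo_filter_ops = {
+ *           .type = FILTER_MODULE,
+ *           .name = "demo filter",
+ *           .directory = "demo_filter",
+ *           .module = THIS_MODULE,
+ *   };
+ *
+ *   static int __init demo_filter_init(void)
+ *   {
+ *           return toi_register_module(&demo_filter_ops);
+ *   }
+ */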
13021 +
13022 +extern int toi_num_modules, toiNumAllocators;
13023 +
13024 +extern struct toi_module_ops *toiActiveAllocator;
13025 +extern struct list_head toi_filters, toiAllocators, toi_modules;
13026 +
13027 +extern void toi_prepare_console_modules(void);
13028 +extern void toi_cleanup_console_modules(void);
13029 +
13030 +extern struct toi_module_ops *toi_find_module_given_name(char *name);
13031 +extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
13032 +
13033 +extern int toi_register_module(struct toi_module_ops *module);
13034 +extern void toi_move_module_tail(struct toi_module_ops *module);
13035 +
13036 +extern long toi_header_storage_for_modules(void);
13037 +extern long toi_memory_for_modules(int print_parts);
13038 +extern int toi_expected_compression_ratio(void);
13039 +
13040 +extern int toi_print_module_debug_info(char *buffer, int buffer_size);
13042 +extern void toi_unregister_module(struct toi_module_ops *module);
13043 +
13044 +extern int toi_initialise_modules(int starting_cycle, int early);
13045 +#define toi_initialise_modules_early(starting) \
13046 +       toi_initialise_modules(starting, 1)
13047 +#define toi_initialise_modules_late(starting) \
13048 +       toi_initialise_modules(starting, 0)
13049 +extern void toi_cleanup_modules(int finishing_cycle);
13050 +
13051 +extern void toi_print_modules(void);
13052 +
13053 +int toi_get_modules(void);
13054 +void toi_put_modules(void);
13055 +#endif
13056 diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
13057 new file mode 100644
13058 index 0000000..fd9e9c1
13059 --- /dev/null
13060 +++ b/kernel/power/tuxonice_netlink.c
13061 @@ -0,0 +1,327 @@
13062 +/*
13063 + * kernel/power/tuxonice_netlink.c
13064 + *
13065 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
13066 + *
13067 + * This file is released under the GPLv2.
13068 + *
13069 + * Functions for communicating with a userspace helper via netlink.
13070 + */
13071 +
13073 +#include <linux/suspend.h>
13074 +#include "tuxonice_netlink.h"
13075 +#include "tuxonice.h"
13076 +#include "tuxonice_modules.h"
13077 +#include "tuxonice_alloc.h"
13078 +
13079 +struct user_helper_data *uhd_list;
13080 +
13081 +/*
13082 + * Refill our pool of SKBs for use in emergencies (e.g. when we are eating
13083 + * memory and none can be allocated).
13084 + */
13085 +static void toi_fill_skb_pool(struct user_helper_data *uhd)
13086 +{
13087 +       while (uhd->pool_level < uhd->pool_limit) {
13088 +               struct sk_buff *new_skb =
13089 +                       alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
13090 +
13091 +               if (!new_skb)
13092 +                       break;
13093 +
13094 +               new_skb->next = uhd->emerg_skbs;
13095 +               uhd->emerg_skbs = new_skb;
13096 +               uhd->pool_level++;
13097 +       }
13098 +}
13099 +
13100 +/*
13101 + * Try to allocate a single skb. If we can't get one, try to use one from
13102 + * our pool.
13103 + */
13104 +static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
13105 +{
13106 +       struct sk_buff *skb =
13107 +               alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
13108 +
13109 +       if (skb)
13110 +               return skb;
13111 +
13112 +       skb = uhd->emerg_skbs;
13113 +       if (skb) {
13114 +               uhd->pool_level--;
13115 +               uhd->emerg_skbs = skb->next;
13116 +               skb->next = NULL;
13117 +       }
13118 +
13119 +       return skb;
13120 +}
13121 +
13122 +static void put_skb(struct user_helper_data *uhd, struct sk_buff *skb)
13123 +{
13124 +       if (uhd->pool_level < uhd->pool_limit) {
13125 +               skb->next = uhd->emerg_skbs;
13126 +               uhd->emerg_skbs = skb;
13127 +       } else
13128 +               kfree_skb(skb);
13129 +}
13130 +
13131 +void toi_send_netlink_message(struct user_helper_data *uhd,
13132 +               int type, void *params, size_t len)
13133 +{
13134 +       struct sk_buff *skb;
13135 +       struct nlmsghdr *nlh;
13136 +       void *dest;
13137 +       struct task_struct *t;
13138 +
13139 +       if (uhd->pid == -1)
13140 +               return;
13141 +
13142 +       skb = toi_get_skb(uhd);
13143 +       if (!skb) {
13144 +               printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
13145 +               return;
13146 +       }
13147 +
13148 +       /* NLMSG_PUT contains a hidden goto nlmsg_failure */
13149 +       nlh = NLMSG_PUT(skb, 0, uhd->sock_seq, type, len);
13150 +       uhd->sock_seq++;
13151 +
13152 +       dest = NLMSG_DATA(nlh);
13153 +       if (params && len > 0)
13154 +               memcpy(dest, params, len);
13155 +
13156 +       netlink_unicast(uhd->nl, skb, uhd->pid, 0);
13157 +
13158 +       read_lock(&tasklist_lock);
13159 +       t = find_task_by_pid_type_ns(PIDTYPE_PID, uhd->pid, &init_pid_ns);
13160 +       if (!t) {
13161 +               read_unlock(&tasklist_lock);
13162 +               if (uhd->pid > -1)
13163 +                       printk(KERN_INFO "Hmm. Can't find the userspace task"
13164 +                               " %d.\n", uhd->pid);
13165 +               return;
13166 +       }
13167 +       wake_up_process(t);
13168 +       read_unlock(&tasklist_lock);
13169 +
13170 +       yield();
13171 +
13172 +       return;
13173 +
13174 +nlmsg_failure:
13175 +       if (skb)
13176 +               put_skb(uhd, skb);
13177 +}
13178 +EXPORT_SYMBOL_GPL(toi_send_netlink_message);
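As a usage illustration (not part of the patch), a client that has completed toi_netlink_setup() might push a typed payload to its helper as below; the message type and the helper function are hypothetical, chosen above NETLINK_MSG_BASE to stay clear of the types defined in tuxonice_netlink.h.

#define NETLINK_MSG_EXAMPLE_PROGRESS (NETLINK_MSG_BASE + 0x40)	/* hypothetical */

static void example_send_progress(struct user_helper_data *uhd, int percent)
{
	/* Safe to call before the helper has connected: the function
	 * returns immediately while uhd->pid is still -1. */
	toi_send_netlink_message(uhd, NETLINK_MSG_EXAMPLE_PROGRESS,
			&percent, sizeof(int));
}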
13179 +
13180 +static void send_whether_debugging(struct user_helper_data *uhd)
13181 +{
13182 +       static int is_debugging = 1;
13183 +
13184 +       toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
13185 +                       &is_debugging, sizeof(int));
13186 +}
13187 +
13188 +/*
13189 + * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
13190 + * are hibernating.
13191 + */
13192 +static int nl_set_nofreeze(struct user_helper_data *uhd, int pid)
13193 +{
13194 +       struct task_struct *t;
13195 +
13196 +       read_lock(&tasklist_lock);
13197 +       t = find_task_by_pid_type_ns(PIDTYPE_PID, pid, &init_pid_ns);
13198 +       if (!t) {
13199 +               read_unlock(&tasklist_lock);
13200 +               printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
13201 +                               pid);
13202 +               return -EINVAL;
13203 +       }
13204 +
13205 +       t->flags |= PF_NOFREEZE;
13206 +
13207 +       read_unlock(&tasklist_lock);
13208 +       uhd->pid = pid;
13209 +
13210 +       toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
13211 +
13212 +       return 0;
13213 +}
13214 +
13215 +/*
13216 + * Called when the userspace process has informed us that it's ready to roll.
13217 + */
13218 +static int nl_ready(struct user_helper_data *uhd, int version)
13219 +{
13220 +       if (version != uhd->interface_version) {
13221 +               printk(KERN_INFO "%s userspace process using invalid interface"
13222 +                               " version. Trying to continue without it.\n",
13223 +                               uhd->name);
13224 +               if (uhd->not_ready)
13225 +                       uhd->not_ready();
13226 +               return -EINVAL;
13227 +       }
13228 +
13229 +       complete(&uhd->wait_for_process);
13230 +
13231 +       return 0;
13232 +}
13233 +
13234 +void toi_netlink_close_complete(struct user_helper_data *uhd)
13235 +{
13236 +       if (uhd->nl) {
13237 +               sock_release(uhd->nl->sk_socket);
13238 +               uhd->nl = NULL;
13239 +       }
13240 +
13241 +       while (uhd->emerg_skbs) {
13242 +               struct sk_buff *next = uhd->emerg_skbs->next;
13243 +               kfree_skb(uhd->emerg_skbs);
13244 +               uhd->emerg_skbs = next;
13245 +       }
13246 +
13247 +       uhd->pid = -1;
13248 +}
13249 +EXPORT_SYMBOL_GPL(toi_netlink_close_complete);
13250 +
13251 +static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
13252 +               struct sk_buff *skb, struct nlmsghdr *nlh)
13253 +{
13254 +       int type;
13255 +       int *data;
13256 +       int err;
13257 +
13258 +       /* Let the more specific handler go first. It returns
13259 +        * 1 for valid messages that it doesn't know. */
13260 +       err = uhd->rcv_msg(skb, nlh);
13261 +       if (err != 1)
13262 +               return err;
13263 +
13264 +       type = nlh->nlmsg_type;
13265 +
13266 +       /* Only allow one task to receive NOFREEZE privileges */
13267 +       if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
13268 +               printk(KERN_INFO "Received extra NOFREEZE_ME requests.\n");
13269 +               return -EBUSY;
13270 +       }
13271 +
13272 +       data = (int *)NLMSG_DATA(nlh);
13273 +
13274 +       switch (type) {
13275 +       case NETLINK_MSG_NOFREEZE_ME:
13276 +               return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
13277 +       case NETLINK_MSG_GET_DEBUGGING:
13278 +               send_whether_debugging(uhd);
13279 +               return 0;
13280 +       case NETLINK_MSG_READY:
13281 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) {
13282 +                       printk(KERN_INFO "Invalid ready message.\n");
13283 +                       return -EINVAL;
13284 +               }
13285 +               return nl_ready(uhd, *data);
13286 +       case NETLINK_MSG_CLEANUP:
13287 +               toi_netlink_close_complete(uhd);
13288 +               return 0;
13289 +       }
13290 +
13291 +       return -EINVAL;
13292 +}
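The rcv_msg hook that toi_nl_gen_rcv_msg() defers to follows the convention noted above: return 0 or an error for messages it consumes, and 1 for anything it does not recognise. A sketch of a conforming handler, reusing the hypothetical message type from the earlier example:

static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	switch (nlh->nlmsg_type) {
	case NETLINK_MSG_EXAMPLE_PROGRESS:	/* hypothetical type */
		/* ... handle the module-specific message ... */
		return 0;
	default:
		return 1;	/* not ours; let the generic handler try */
	}
}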
13293 +
13294 +static void toi_user_rcv_skb(struct sk_buff *skb)
13295 +{
13296 +       int err;
13297 +       struct nlmsghdr *nlh;
13298 +       struct user_helper_data *uhd = uhd_list;
13299 +
13300 +       while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
13301 +               uhd = uhd->next;
13302 +
13303 +       if (!uhd)
13304 +               return;
13305 +
13306 +       while (skb->len >= NLMSG_SPACE(0)) {
13307 +               u32 rlen;
13308 +
13309 +               nlh = (struct nlmsghdr *) skb->data;
13310 +               if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
13311 +                       return;
13312 +
13313 +               rlen = NLMSG_ALIGN(nlh->nlmsg_len);
13314 +               if (rlen > skb->len)
13315 +                       rlen = skb->len;
13316 +
13317 +               err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
13318 +               if (err)
13319 +                       netlink_ack(skb, nlh, err);
13320 +               else if (nlh->nlmsg_flags & NLM_F_ACK)
13321 +                       netlink_ack(skb, nlh, 0);
13322 +               skb_pull(skb, rlen);
13323 +       }
13324 +}
13325 +
13326 +static int netlink_prepare(struct user_helper_data *uhd)
13327 +{
13328 +       uhd->next = uhd_list;
13329 +       uhd_list = uhd;
13330 +
13331 +       uhd->sock_seq = 0x42c0ffee;
13332 +       uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, 0,
13333 +                       toi_user_rcv_skb, NULL, THIS_MODULE);
13334 +       if (!uhd->nl) {
13335 +               printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
13336 +                               uhd->name);
13337 +               return -ENOMEM;
13338 +       }
13339 +
13340 +       toi_fill_skb_pool(uhd);
13341 +
13342 +       return 0;
13343 +}
13344 +
13345 +void toi_netlink_close(struct user_helper_data *uhd)
13346 +{
13347 +       struct task_struct *t;
13348 +
13349 +       read_lock(&tasklist_lock);
13350 +       t = find_task_by_pid_type_ns(PIDTYPE_PID, uhd->pid, &init_pid_ns);
13351 +       if (t)
13352 +               t->flags &= ~PF_NOFREEZE;
13353 +       read_unlock(&tasklist_lock);
13354 +
13355 +       toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
13356 +}
13357 +EXPORT_SYMBOL_GPL(toi_netlink_close);
13358 +
13359 +int toi_netlink_setup(struct user_helper_data *uhd)
13360 +{
13361 +       /* In case userui didn't clean up properly on us */
13362 +       toi_netlink_close_complete(uhd);
13363 +
13364 +       if (netlink_prepare(uhd) < 0) {
13365 +               printk(KERN_INFO "Netlink prepare failed.\n");
13366 +               return 1;
13367 +       }
13368 +
13369 +       if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
13370 +                               UMH_WAIT_EXEC) < 0) {
13371 +               printk(KERN_INFO "Launch userspace program failed.\n");
13372 +               toi_netlink_close_complete(uhd);
13373 +               return 1;
13374 +       }
13375 +
13376 +       /* Wait 2 seconds for the userspace process to make contact */
13377 +       wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
13378 +
13379 +       if (uhd->pid == -1) {
13380 +               printk(KERN_INFO "%s: Failed to contact userspace process.\n",
13381 +                               uhd->name);
13382 +               toi_netlink_close_complete(uhd);
13383 +               return 1;
13384 +       }
13385 +
13386 +       return 0;
13387 +}
13388 +EXPORT_SYMBOL_GPL(toi_netlink_setup);
13389 diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
13390 new file mode 100644
13391 index 0000000..1436ac3
13392 --- /dev/null
13393 +++ b/kernel/power/tuxonice_netlink.h
13394 @@ -0,0 +1,61 @@
13395 +/*
13396 + * kernel/power/tuxonice_netlink.h
13397 + *
13398 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
13399 + *
13400 + * This file is released under the GPLv2.
13401 + *
13402 + * Declarations for functions for communicating with a userspace helper
13403 + * via netlink.
13404 + */
13405 +
13406 +#include <linux/netlink.h>
13407 +#include <net/sock.h>
13408 +
13409 +#define NETLINK_MSG_BASE 0x10
13410 +
13411 +#define NETLINK_MSG_READY 0x10
13412 +#define NETLINK_MSG_NOFREEZE_ME 0x16
13413 +#define NETLINK_MSG_GET_DEBUGGING 0x19
13414 +#define NETLINK_MSG_CLEANUP 0x24
13415 +#define NETLINK_MSG_NOFREEZE_ACK 0x27
13416 +#define NETLINK_MSG_IS_DEBUGGING 0x28
13417 +
13418 +struct user_helper_data {
13419 +       int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
13420 +       void (*not_ready) (void);
13421 +       struct sock *nl;
13422 +       u32 sock_seq;
13423 +       pid_t pid;
13424 +       char *comm;
13425 +       char program[256];
13426 +       int pool_level;
13427 +       int pool_limit;
13428 +       struct sk_buff *emerg_skbs;
13429 +       int skb_size;
13430 +       int netlink_id;
13431 +       char *name;
13432 +       struct user_helper_data *next;
13433 +       struct completion wait_for_process;
13434 +       int interface_version;
13435 +       int must_init;
13436 +};
13437 +
13438 +#ifdef CONFIG_NET
13439 +int toi_netlink_setup(struct user_helper_data *uhd);
13440 +void toi_netlink_close(struct user_helper_data *uhd);
13441 +void toi_send_netlink_message(struct user_helper_data *uhd,
13442 +               int type, void *params, size_t len);
13443 +void toi_netlink_close_complete(struct user_helper_data *uhd);
13444 +#else
13445 +static inline int toi_netlink_setup(struct user_helper_data *uhd)
13446 +{
13447 +       return 0;
13448 +}
13449 +
13450 +static inline void toi_netlink_close(struct user_helper_data *uhd) { }
13451 +static inline void toi_send_netlink_message(struct user_helper_data *uhd,
13452 +               int type, void *params, size_t len) { }
13453 +static inline void toi_netlink_close_complete(struct user_helper_data *uhd)
13454 +       { }
13455 +#endif
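Putting the pieces together, a client of this interface might be wired up roughly as follows. Every value here (name, program path, protocol id, pool sizes) is illustrative rather than taken from the patch, and a real caller would also run init_completion() on wait_for_process before calling toi_netlink_setup().

static int example_rcv(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	return 1;	/* nothing module-specific; defer to the generic code */
}

static struct user_helper_data example_uhd = {
	.rcv_msg = example_rcv,
	.name = "example",
	.program = "/usr/sbin/example-helper",	/* hypothetical path */
	.pool_limit = 6,
	.skb_size = 4096,
	.netlink_id = 29,			/* hypothetical protocol id */
	.interface_version = 1,
};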
13456 diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
13457 new file mode 100644
13458 index 0000000..93b65cd
13459 --- /dev/null
13460 +++ b/kernel/power/tuxonice_pagedir.c
13461 @@ -0,0 +1,347 @@
13462 +/*
13463 + * kernel/power/tuxonice_pagedir.c
13464 + *
13465 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
13466 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
13467 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
13468 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
13469 + *
13470 + * This file is released under the GPLv2.
13471 + *
13472 + * Routines for handling pagesets.
13473 + * Note that pbes aren't actually stored as such. They're stored as
13474 + * bitmaps and extents.
13475 + */
13476 +
13477 +#include <linux/suspend.h>
13478 +#include <linux/highmem.h>
13479 +#include <linux/bootmem.h>
13480 +#include <linux/hardirq.h>
13481 +#include <linux/sched.h>
13482 +#include <asm/tlbflush.h>
13483 +
13484 +#include "tuxonice_pageflags.h"
13485 +#include "tuxonice_ui.h"
13486 +#include "tuxonice_pagedir.h"
13487 +#include "tuxonice_prepare_image.h"
13488 +#include "tuxonice.h"
13489 +#include "power.h"
13490 +#include "tuxonice_builtin.h"
13491 +#include "tuxonice_alloc.h"
13492 +
13493 +static int ptoi_pfn;
13494 +static struct pbe *this_low_pbe;
13495 +static struct pbe **last_low_pbe_ptr;
13496 +
13497 +void toi_reset_alt_image_pageset2_pfn(void)
13498 +{
13499 +       ptoi_pfn = max_pfn + 1;
13500 +}
13501 +
13502 +static struct page *first_conflicting_page;
13503 +
13504 +/*
13505 + * free_conflicting_pages: Free pages allocated but found to conflict.
13506 + */
13507 +
13508 +void free_conflicting_pages(void)
13509 +{
13510 +       while (first_conflicting_page) {
13511 +               struct page *next =
13512 +                       *((struct page **) kmap(first_conflicting_page));
13513 +               kunmap(first_conflicting_page);
13514 +               toi__free_page(29, first_conflicting_page);
13515 +               first_conflicting_page = next;
13516 +       }
13517 +}
13518 +
13519 +/* __toi_get_nonconflicting_page
13520 + *
13521 + * Description: Gets order zero pages that won't be overwritten
13522 + *             while copying the original pages.
13523 + */
13524 +
13525 +struct page *___toi_get_nonconflicting_page(int can_be_highmem)
13526 +{
13527 +       struct page *page;
13528 +       int flags = TOI_ATOMIC_GFP;
13529 +       if (can_be_highmem)
13530 +               flags |= __GFP_HIGHMEM;
13531 +
13533 +       if (test_toi_state(TOI_LOADING_ALT_IMAGE) && pageset2_map.bitmap &&
13534 +                               (ptoi_pfn < (max_pfn + 2))) {
13535 +               /*
13536 +                * ptoi_pfn = max_pfn + 1 when yet to find first ps2 pfn that
13537 +                * can be used.
13538 +                *         = 0..max_pfn when going through list.
13539 +                *         = max_pfn + 2 when gone through whole list.
13540 +                */
13541 +               do {
13542 +                       ptoi_pfn = get_next_bit_on(&pageset2_map, ptoi_pfn);
13543 +                       if (ptoi_pfn <= max_pfn) {
13544 +                               page = pfn_to_page(ptoi_pfn);
13545 +                               if (!PagePageset1(page) &&
13546 +                                   (can_be_highmem || !PageHighMem(page)))
13547 +                                       return page;
13548 +                       } else
13549 +                               ptoi_pfn++;
13550 +               } while (ptoi_pfn < max_pfn);
13551 +       }
13552 +
13553 +       do {
13554 +               page = toi_alloc_page(29, flags);
13555 +               if (!page) {
13556 +                       printk(KERN_INFO "Failed to get nonconflicting "
13557 +                                       "page.\n");
13558 +                       return NULL;
13559 +               }
13560 +               if (PagePageset1(page)) {
13561 +                       struct page **next = (struct page **) kmap(page);
13562 +                       *next = first_conflicting_page;
13563 +                       first_conflicting_page = page;
13564 +                       kunmap(page);
13565 +               }
13566 +       } while (PagePageset1(page));
13567 +
13568 +       return page;
13569 +}
13570 +
13571 +unsigned long __toi_get_nonconflicting_page(void)
13572 +{
13573 +       struct page *page = ___toi_get_nonconflicting_page(0);
13574 +       return page ? (unsigned long) page_address(page) : 0;
13575 +}
13576 +
13577 +struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
13578 +               int highmem)
13579 +{
13580 +       if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
13581 +                    + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
13582 +               struct page *new_page =
13583 +                       ___toi_get_nonconflicting_page(highmem);
13584 +               if (!new_page)
13585 +                       return ERR_PTR(-ENOMEM);
13586 +               this_pbe = (struct pbe *) kmap(new_page);
13587 +               memset(this_pbe, 0, PAGE_SIZE);
13588 +               *page_ptr = new_page;
13589 +       } else
13590 +               this_pbe++;
13591 +
13592 +       return this_pbe;
13593 +}
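To see the page-boundary test in get_next_pbe() with concrete numbers (PAGE_SIZE of 4096 and a sizeof(struct pbe) of 16 are assumed purely for illustration): with this_pbe at offset 4064, 4064 + 2 * 16 = 4096 is not greater than PAGE_SIZE, so the pbe at offset 4080, the last one that fits entirely, is still handed out; on the following call, 4080 + 32 = 4112 exceeds PAGE_SIZE and a fresh nonconflicting page is allocated and mapped instead.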
13594 +
13595 +/* get_pageset1_load_addresses
13596 + *
13597 + * Description: Find load addresses for pageset1 such that the pagedir and
13598 + *             the pages it points to won't collide with the pages to which
13599 + *             we will later restore the loaded data.
13600 + * Returns:    Zero on success, one if we couldn't find enough pages (which
13601 + *             shouldn't happen).
13602 + */
13603 +
13604 +int toi_get_pageset1_load_addresses(void)
13605 +{
13606 +       int pfn, highallocd = 0, lowallocd = 0;
13607 +       int low_needed = pagedir1.size - get_highmem_size(pagedir1);
13608 +       int high_needed = get_highmem_size(pagedir1);
13609 +       int low_pages_for_highmem = 0;
13610 +       unsigned long flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
13611 +       struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
13612 +                   *low_pbe_page;
13613 +       struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
13614 +                  *this_high_pbe = NULL;
13615 +       int orig_low_pfn = max_pfn + 1, orig_high_pfn = max_pfn + 1;
13616 +       int high_pbes_done = 0, low_pbes_done = 0;
13617 +       int low_direct = 0, high_direct = 0;
13618 +       int high_to_free, low_to_free;
13619 +
13620 +       last_low_pbe_ptr = &restore_pblist;
13621 +
13622 +       /* First, allocate pages for the start of our pbe lists. */
13623 +       if (high_needed) {
13624 +               high_pbe_page = ___toi_get_nonconflicting_page(1);
13625 +               if (!high_pbe_page)
13626 +                       return 1;
13627 +               this_high_pbe = (struct pbe *) kmap(high_pbe_page);
13628 +               memset(this_high_pbe, 0, PAGE_SIZE);
13629 +       }
13630 +
13631 +       low_pbe_page = ___toi_get_nonconflicting_page(0);
13632 +       if (!low_pbe_page)
13633 +               return 1;
13634 +       this_low_pbe = (struct pbe *) page_address(low_pbe_page);
13635 +
13636 +       /*
13637 +        * Next, allocate all possible memory to find where we can
13638 +        * load data directly into destination pages. I'd like to do
13639 +        * this in bigger chunks, but then we can't free pages
13640 +        * individually later.
13641 +        */
13642 +
13643 +       do {
13644 +               page = toi_alloc_page(30, flags);
13645 +               if (page)
13646 +                       SetPagePageset1Copy(page);
13647 +       } while (page);
13648 +
13649 +       /*
13650 +        * Find out how many high- and lowmem pages we allocated above,
13651 +        * and how many pages we can reload directly to their original
13652 +        * location.
13653 +        */
13654 +       BITMAP_FOR_EACH_SET(&pageset1_copy_map, pfn) {
13655 +               int is_high;
13656 +               page = pfn_to_page(pfn);
13657 +               is_high = PageHighMem(page);
13658 +
13659 +               if (PagePageset1(page)) {
13660 +                       if (test_action_state(TOI_NO_DIRECT_LOAD)) {
13661 +                               ClearPagePageset1Copy(page);
13662 +                               toi__free_page(30, page);
13663 +                               continue;
13664 +                       } else {
13665 +                               if (is_high)
13666 +                                       high_direct++;
13667 +                               else
13668 +                                       low_direct++;
13669 +                       }
13670 +               } else {
13671 +                       if (is_high)
13672 +                               highallocd++;
13673 +                       else
13674 +                               lowallocd++;
13675 +               }
13676 +       }
13677 +
13678 +       high_needed -= high_direct;
13679 +       low_needed -= low_direct;
13680 +
13681 +       /*
13682 +        * Do we need to use some lowmem pages for the copies of highmem
13683 +        * pages?
13684 +        */
13685 +       if (high_needed > highallocd) {
13686 +               low_pages_for_highmem = high_needed - highallocd;
13687 +               high_needed -= low_pages_for_highmem;
13688 +               low_needed += low_pages_for_highmem;
13689 +       }
13690 +
13691 +       high_to_free = highallocd - high_needed;
13692 +       low_to_free = lowallocd - low_needed;
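	/*
	 * Worked example with illustrative figures only: say pageset1 holds
	 * 1000 highmem and 5000 lowmem pages, of which 100 and 200
	 * respectively can be loaded direct, and the loop above allocated
	 * 800 highmem and 6000 lowmem pages. Then high_needed = 900 against
	 * highallocd = 800, so low_pages_for_highmem = 100, leaving
	 * high_needed = 800 and low_needed = 4800 + 100 = 4900; hence
	 * high_to_free = 0 and low_to_free = 1100.
	 */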
13693 +
13694 +       /*
13695 +        * Now generate our pbes (which will be used for the atomic
13696 +        * restore) and free unneeded pages.
13697 +        */
13698 +       BITMAP_FOR_EACH_SET(&pageset1_copy_map, pfn) {
13699 +               int is_high;
13700 +               page = pfn_to_page(pfn);
13701 +               is_high = PageHighMem(page);
13702 +
13703 +               if (PagePageset1(page))
13704 +                       continue;
13705 +
13706 +               /* Free the page? */
13707 +               if ((is_high && high_to_free) ||
13708 +                   (!is_high && low_to_free)) {
13709 +                       ClearPagePageset1Copy(page);
13710 +                       toi__free_page(30, page);
13711 +                       if (is_high)
13712 +                               high_to_free--;
13713 +                       else
13714 +                               low_to_free--;
13715 +                       continue;
13716 +               }
13717 +
13718 +               /* Nope. We're going to use this page. Add a pbe. */
13719 +               if (is_high || low_pages_for_highmem) {
13720 +                       struct page *orig_page;
13721 +                       high_pbes_done++;
13722 +                       if (!is_high)
13723 +                               low_pages_for_highmem--;
13724 +                       do {
13725 +                               orig_high_pfn = get_next_bit_on(&pageset1_map,
13726 +                                               orig_high_pfn);
13727 +                               BUG_ON(orig_high_pfn > max_pfn);
13728 +                               orig_page = pfn_to_page(orig_high_pfn);
13729 +                       } while (!PageHighMem(orig_page) ||
13730 +                                       load_direct(orig_page));
13731 +
13732 +                       this_high_pbe->orig_address = orig_page;
13733 +                       this_high_pbe->address = page;
13734 +                       this_high_pbe->next = NULL;
13735 +                       if (last_high_pbe_page != high_pbe_page) {
13736 +                               *last_high_pbe_ptr =
13737 +                                       (struct pbe *) high_pbe_page;
13738 +                               if (!last_high_pbe_page)
13739 +                                       last_high_pbe_page = high_pbe_page;
13740 +                       } else
13741 +                               *last_high_pbe_ptr = this_high_pbe;
13742 +                       last_high_pbe_ptr = &this_high_pbe->next;
13743 +                       if (last_high_pbe_page != high_pbe_page) {
13744 +                               kunmap(last_high_pbe_page);
13745 +                               last_high_pbe_page = high_pbe_page;
13746 +                       }
13747 +                       this_high_pbe = get_next_pbe(&high_pbe_page,
13748 +                                       this_high_pbe, 1);
13749 +                       if (IS_ERR(this_high_pbe)) {
13750 +                               printk(KERN_INFO
13751 +                                               "Failed to get next high pbe.\n");
13752 +                               return -ENOMEM;
13753 +                       }
13754 +               } else {
13755 +                       struct page *orig_page;
13756 +                       low_pbes_done++;
13757 +                       do {
13758 +                               orig_low_pfn = get_next_bit_on(&pageset1_map,
13759 +                                               orig_low_pfn);
13760 +                               BUG_ON(orig_low_pfn > max_pfn);
13761 +                               orig_page = pfn_to_page(orig_low_pfn);
13762 +                       } while (PageHighMem(orig_page) ||
13763 +                                       load_direct(orig_page));
13764 +
13765 +                       this_low_pbe->orig_address = page_address(orig_page);
13766 +                       this_low_pbe->address = page_address(page);
13767 +                       this_low_pbe->next = NULL;
13768 +                       *last_low_pbe_ptr = this_low_pbe;
13769 +                       last_low_pbe_ptr = &this_low_pbe->next;
13770 +                       this_low_pbe = get_next_pbe(&low_pbe_page,
13771 +                                       this_low_pbe, 0);
13772 +                       if (IS_ERR(this_low_pbe)) {
13773 +                               printk(KERN_INFO "Failed to get next low pbe.\n");
13774 +                               return -ENOMEM;
13775 +                       }
13776 +               }
13777 +       }
13778 +
13779 +       if (high_pbe_page)
13780 +               kunmap(high_pbe_page);
13781 +
13782 +       if (last_high_pbe_page != high_pbe_page) {
13783 +               if (last_high_pbe_page)
13784 +                       kunmap(last_high_pbe_page);
13785 +               toi__free_page(29, high_pbe_page);
13786 +       }
13787 +
13788 +       free_conflicting_pages();
13789 +
13790 +       return 0;
13791 +}
13792 +
13793 +int add_boot_kernel_data_pbe(void)
13794 +{
13795 +       this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
13796 +       if (!this_low_pbe->address) {
13797 +               printk(KERN_INFO "Failed to get bkd atomic restore buffer.\n");
13798 +               return -ENOMEM;
13799 +       }
13800 +
13801 +       toi_bkd.size = sizeof(toi_bkd);
13802 +       memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
13803 +
13804 +       *last_low_pbe_ptr = this_low_pbe;
13805 +       this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
13806 +       this_low_pbe->next = NULL;
13807 +       return 0;
13808 +}
13809 diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
13810 new file mode 100644
13811 index 0000000..081e744
13812 --- /dev/null
13813 +++ b/kernel/power/tuxonice_pagedir.h
13814 @@ -0,0 +1,50 @@
13815 +/*
13816 + * kernel/power/tuxonice_pagedir.h
13817 + *
13818 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
13819 + *
13820 + * This file is released under the GPLv2.
13821 + *
13822 + * Declarations for routines for handling pagesets.
13823 + */
13824 +
13825 +#ifndef KERNEL_POWER_PAGEDIR_H
13826 +#define KERNEL_POWER_PAGEDIR_H
13827 +
13828 +/* Pagedir
13829 + *
13830 + * Contains the metadata for a set of pages saved in the image.
13831 + */
13832 +
13833 +struct pagedir {
13834 +       int id;
13835 +       long size;
13836 +#ifdef CONFIG_HIGHMEM
13837 +       long size_high;
13838 +#endif
13839 +};
13840 +
13841 +#ifdef CONFIG_HIGHMEM
13842 +#define get_highmem_size(pagedir) (pagedir.size_high)
13843 +#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
13844 +#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
13845 +#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
13846 +#else
13847 +#define get_highmem_size(pagedir) (0)
13848 +#define set_highmem_size(pagedir, sz) do { } while (0)
13849 +#define inc_highmem_size(pagedir) do { } while (0)
13850 +#define get_lowmem_size(pagedir) (pagedir.size)
13851 +#endif
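Note that these accessors take the struct pagedir itself rather than a pointer, so callers apply them directly to the globals declared below; purely for illustration:

static inline long example_lowmem_pages(void)
{
	return get_lowmem_size(pagedir1);	/* size minus highmem portion */
}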
13852 +
13853 +extern struct pagedir pagedir1, pagedir2;
13854 +
13855 +extern void toi_copy_pageset1(void);
13856 +
13857 +extern int toi_get_pageset1_load_addresses(void);
13858 +
13859 +extern unsigned long __toi_get_nonconflicting_page(void);
13860 +struct page *___toi_get_nonconflicting_page(int can_be_highmem);
13861 +
13862 +extern void toi_reset_alt_image_pageset2_pfn(void);
13863 +extern int add_boot_kernel_data_pbe(void);
13864 +#endif
13865 diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
13866 new file mode 100644
13867 index 0000000..574858c
13868 --- /dev/null
13869 +++ b/kernel/power/tuxonice_pageflags.c
13870 @@ -0,0 +1,162 @@
13871 +/*
13872 + * kernel/power/tuxonice_pageflags.c
13873 + *
13874 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
13875 + *
13876 + * This file is released under the GPLv2.
13877 + *
13878 + * Routines for serialising and relocating pageflags in which we
13879 + * store our image metadata.
13880 + */
13881 +
13882 +#include <linux/kernel.h>
13883 +#include <linux/mm.h>
13884 +#include <linux/module.h>
13885 +#include <linux/bitops.h>
13886 +#include <linux/list.h>
13887 +#include <linux/suspend.h>
13888 +#include "tuxonice_pageflags.h"
13889 +#include "tuxonice_modules.h"
13890 +#include "tuxonice_pagedir.h"
13891 +#include "tuxonice.h"
13892 +
13893 +DECLARE_DYN_PAGEFLAGS(pageset2_map);
13894 +DECLARE_DYN_PAGEFLAGS(page_resave_map);
13895 +DECLARE_DYN_PAGEFLAGS(io_map);
13896 +DECLARE_DYN_PAGEFLAGS(nosave_map);
13897 +DECLARE_DYN_PAGEFLAGS(free_map);
13898 +
13899 +static int pages_for_zone(struct zone *zone)
13900 +{
13901 +       return DIV_ROUND_UP(zone->spanned_pages, (PAGE_SIZE << 3));
13902 +}
13903 +
13904 +int toi_pageflags_space_needed(void)
13905 +{
13906 +       int total = 0;
13907 +       struct zone *zone;
13908 +
13909 +       for_each_zone(zone)
13910 +               if (populated_zone(zone))
13911 +                       total += sizeof(int) * 3 + pages_for_zone(zone) *
13912 +                               PAGE_SIZE;
13913 +
13914 +       total += sizeof(int);
13915 +
13916 +       return total;
13917 +}
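As a worked example with an illustrative zone size: a populated zone spanning 262144 pages (1 GiB of 4 KiB pages) needs one bit per page, i.e. DIV_ROUND_UP(262144, 32768) = 8 bitmap pages, so that zone contributes 3 * sizeof(int) + 8 * PAGE_SIZE = 12 + 32768 bytes to the header. The trailing sizeof(int) accounts for the -1 end-of-data marker written by save_dyn_pageflags() below.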
13918 +
13919 +/* save_dyn_pageflags
13920 + *
13921 + * Description: Save a set of pageflags.
13922 + * Arguments:   struct dyn_pageflags *: Pointer to the bitmap being saved.
13923 + */
13924 +
13925 +void save_dyn_pageflags(struct dyn_pageflags *pagemap)
13926 +{
13927 +       int i, zone_idx, size, node = 0;
13928 +       struct zone *zone;
13929 +       struct pglist_data *pgdat;
13930 +
13931 +       if (!pagemap)
13932 +               return;
13933 +
13934 +       for_each_online_pgdat(pgdat) {
13935 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
13936 +                       zone = &pgdat->node_zones[zone_idx];
13937 +
13938 +                       if (!populated_zone(zone))
13939 +                               continue;
13940 +
13941 +                       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13942 +                                       (char *) &node, sizeof(int));
13943 +                       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13944 +                                       (char *) &zone_idx, sizeof(int));
13945 +                       size = pages_for_zone(zone);
13946 +                       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13947 +                                       (char *) &size, sizeof(int));
13948 +
13949 +                       for (i = 0; i < size; i++) {
13950 +                               if (!pagemap->bitmap[node][zone_idx][i+2]) {
13951 +                                       printk(KERN_INFO "Sparse pagemap?\n");
13952 +                                       dump_pagemap(pagemap);
13953 +                                       BUG();
13954 +                               }
13955 +                               toiActiveAllocator->rw_header_chunk(WRITE,
13956 +                                       NULL, (char *) pagemap->bitmap[node]
13957 +                                               [zone_idx][i+2],
13958 +                                       PAGE_SIZE);
13959 +                       }
13960 +               }
13961 +               node++;
13962 +       }
13963 +       node = -1;
13964 +       toiActiveAllocator->rw_header_chunk(WRITE, NULL,
13965 +                       (char *) &node, sizeof(int));
13966 +}
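The stream written above is therefore, for each populated zone: an int node index, an int zone index, an int bitmap page count, then that many PAGE_SIZE pages of bitmap data, with a node value of -1 terminating the whole stream. load_dyn_pageflags() below consumes and cross-checks exactly this sequence.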
13967 +
13968 +/* load_dyn_pageflags
13969 + *
13970 + * Description: Load a set of pageflags.
13971 + * Arguments:   struct dyn_pageflags *: Pointer to the bitmap being loaded.
13972 + *              (It must be allocated before calling this routine).
13973 + */
13974 +
13975 +int load_dyn_pageflags(struct dyn_pageflags *pagemap)
13976 +{
13977 +       int i, zone_idx, zone_check = 0, size, node = 0;
13978 +       struct zone *zone;
13979 +       struct pglist_data *pgdat;
13980 +
13981 +       if (!pagemap)
13982 +               return 1;
13983 +
13984 +       for_each_online_pgdat(pgdat) {
13985 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
13986 +                       zone = &pgdat->node_zones[zone_idx];
13987 +
13988 +                       if (!populated_zone(zone))
13989 +                               continue;
13990 +
13991 +                       /* Same node? */
13992 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
13993 +                                       (char *) &zone_check, sizeof(int));
13994 +                       if (zone_check != node) {
13995 +                               printk(KERN_INFO "Node read (%d) != node "
13996 +                                               "(%d).\n",
13997 +                                               zone_check, node);
13998 +                               return 1;
13999 +                       }
14000 +
14001 +                       /* Same zone? */
14002 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
14003 +                                       (char *) &zone_check, sizeof(int));
14004 +                       if (zone_check != zone_idx) {
14005 +                               printk(KERN_INFO "Zone read (%d) != zone "
14006 +                                               "(%d).\n",
14007 +                                               zone_check, zone_idx);
14008 +                               return 1;
14009 +                       }
14010 +
14012 +                       toiActiveAllocator->rw_header_chunk(READ, NULL,
14013 +                               (char *) &size, sizeof(int));
14014 +
14015 +                       for (i = 0; i < size; i++)
14016 +                               toiActiveAllocator->rw_header_chunk(READ, NULL,
14017 +                                       (char *) pagemap->bitmap[node][zone_idx]
14018 +                                                                       [i+2],
14019 +                                       PAGE_SIZE);
14020 +               }
14021 +               node++;
14022 +       }
14023 +       toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &zone_check,
14024 +                       sizeof(int));
14025 +       if (zone_check != -1) {
14026 +               printk(KERN_INFO "Didn't read end of dyn pageflag data "
14027 +                               "marker (%x).\n", zone_check);
14028 +               return 1;
14029 +       }
14030 +
14031 +       return 0;
14032 +}
14033 diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
14034 new file mode 100644
14035 index 0000000..f976b5c
14036 --- /dev/null
14037 +++ b/kernel/power/tuxonice_pageflags.h
14038 @@ -0,0 +1,63 @@
14039 +/*
14040 + * kernel/power/tuxonice_pageflags.h
14041 + *
14042 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
14043 + *
14044 + * This file is released under the GPLv2.
14045 + *
14046 + * TuxOnIce needs a few pageflags while working that aren't otherwise
14047 + * used. To save the struct page pageflags, we dynamically allocate
14048 + * a bitmap and use that. These are the only non order-0 allocations
14049 + * we do.
14050 + *
14051 + * NOTE!!!
14052 + * We assume that PAGE_SIZE - sizeof(void *) is a multiple of
14053 + * sizeof(unsigned long). Is this ever false?
14054 + */
14055 +
14056 +#include <linux/dyn_pageflags.h>
14057 +#include <linux/suspend.h>
14058 +
14059 +extern struct dyn_pageflags pageset1_map;
14060 +extern struct dyn_pageflags pageset1_copy_map;
14061 +extern struct dyn_pageflags pageset2_map;
14062 +extern struct dyn_pageflags page_resave_map;
14063 +extern struct dyn_pageflags io_map;
14064 +extern struct dyn_pageflags nosave_map;
14065 +extern struct dyn_pageflags free_map;
14066 +
14067 +#define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
14068 +#define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
14069 +#define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
14070 +
14071 +#define PagePageset1Copy(page) (test_dynpageflag(&pageset1_copy_map, page))
14072 +#define SetPagePageset1Copy(page) (set_dynpageflag(&pageset1_copy_map, page))
14073 +#define ClearPagePageset1Copy(page) \
14074 +       (clear_dynpageflag(&pageset1_copy_map, page))
14075 +
14076 +#define PagePageset2(page) (test_dynpageflag(&pageset2_map, page))
14077 +#define SetPagePageset2(page) (set_dynpageflag(&pageset2_map, page))
14078 +#define ClearPagePageset2(page) (clear_dynpageflag(&pageset2_map, page))
14079 +
14080 +#define PageWasRW(page) (test_dynpageflag(&pageset2_map, page))
14081 +#define SetPageWasRW(page) (set_dynpageflag(&pageset2_map, page))
14082 +#define ClearPageWasRW(page) (clear_dynpageflag(&pageset2_map, page))
14083 +
14084 +#define PageResave(page) (page_resave_map.bitmap ? \
14085 +       test_dynpageflag(&page_resave_map, page) : 0)
14086 +#define SetPageResave(page) (set_dynpageflag(&page_resave_map, page))
14087 +#define ClearPageResave(page) (clear_dynpageflag(&page_resave_map, page))
14088 +
14089 +#define PageNosave(page) (nosave_map.bitmap ? \
14090 +               test_dynpageflag(&nosave_map, page) : 0)
14091 +#define SetPageNosave(page) (set_dynpageflag(&nosave_map, page))
14092 +#define ClearPageNosave(page) (clear_dynpageflag(&nosave_map, page))
14093 +
14094 +#define PageNosaveFree(page) (free_map.bitmap ? \
14095 +               test_dynpageflag(&free_map, page) : 0)
14096 +#define SetPageNosaveFree(page) (set_dynpageflag(&free_map, page))
14097 +#define ClearPageNosaveFree(page) (clear_dynpageflag(&free_map, page))
14098 +
14099 +extern void save_dyn_pageflags(struct dyn_pageflags *pagemap);
14100 +extern int load_dyn_pageflags(struct dyn_pageflags *pagemap);
14101 +extern int toi_pageflags_space_needed(void);
14102 diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
14103 new file mode 100644
14104 index 0000000..e35736f
14105 --- /dev/null
14106 +++ b/kernel/power/tuxonice_power_off.c
14107 @@ -0,0 +1,284 @@
14108 +/*
14109 + * kernel/power/tuxonice_power_off.c
14110 + *
14111 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
14112 + *
14113 + * This file is released under the GPLv2.
14114 + *
14115 + * Support for powering down.
14116 + */
14117 +
14118 +#include <linux/device.h>
14119 +#include <linux/suspend.h>
14120 +#include <linux/mm.h>
14121 +#include <linux/pm.h>
14122 +#include <linux/reboot.h>
14123 +#include <linux/cpu.h>
14124 +#include <linux/console.h>
14125 +#include <linux/fs.h>
14126 +#include "tuxonice.h"
14127 +#include "tuxonice_ui.h"
14128 +#include "tuxonice_power_off.h"
14129 +#include "tuxonice_sysfs.h"
14130 +#include "tuxonice_modules.h"
14131 +
14132 +unsigned long toi_poweroff_method; /* 0 - Kernel power off */
14133 +EXPORT_SYMBOL_GPL(toi_poweroff_method);
14134 +
14135 +int wake_delay;
14136 +static char lid_state_file[256], wake_alarm_dir[256];
14137 +static struct file *lid_file, *alarm_file, *epoch_file;
14138 +int post_wake_state = -1;
14139 +
14140 +static int did_suspend_to_both;
14141 +
14142 +/*
14143 + * __toi_power_down
14144 + * Functionality   : Powers down or reboots the computer once the image
14145 + *                   has been written to disk.
14146 + * Key Assumptions : We are able to reboot/power down via the code called,
14147 + *                   or that the warning emitted if those calls fail will
14148 + *                   be visible to the user (i.e. printk resumes devices).
14149 + */
14150 +
14151 +static void __toi_power_down(int method)
14152 +{
14153 +       int error;
14154 +
14155 +       if (test_action_state(TOI_REBOOT)) {
14156 +               toi_cond_pause(1, "Ready to reboot.");
14157 +               kernel_restart(NULL);
14158 +       }
14159 +
14160 +       toi_cond_pause(1, "Powering down.");
14161 +
14162 +       switch (method) {
14163 +       case 0:
14164 +               break;
14165 +       case 3:
14166 +               error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
14167 +               if (!error) {
14168 +                       error = suspend_devices_and_enter(PM_SUSPEND_MEM);
14169 +                       if (!error)
14170 +                               did_suspend_to_both = 1;
14171 +               }
14172 +               pm_notifier_call_chain(PM_POST_SUSPEND);
14173 +
14174 +               /* Success - we're now post-resume-from-ram */
14175 +               if (did_suspend_to_both)
14176 +                       return;
14177 +
14178 +               /* Failed to suspend to ram - do normal power off */
14179 +               break;
14180 +       case 4:
14181 +               /*
14182 +                * If it succeeds, this doesn't return. If it fails, fall
14183 +                * back to a simple power off below.
14184 +                */
14185 +               hibernation_platform_enter();
14186 +               break;
14187 +       case 5:
14188 +               /* Historic entry only now */
14189 +               break;
14190 +       }
14191 +
14192 +       if (method && method != 5)
14193 +               toi_cond_pause(1,
14194 +                       "Falling back to alternate power off method.");
14195 +
14196 +       if (test_result_state(TOI_ABORTED))
14197 +               return;
14198 +
14199 +       kernel_power_off();
14200 +       kernel_halt();
14201 +       toi_cond_pause(1, "Powerdown failed.");
14202 +       while (1)
14203 +               cpu_relax();
14204 +}
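In summary, as implemented by the switch above: method 0 is a plain kernel power off, method 3 first attempts suspend-to-RAM (giving "suspend to both"), method 4 enters the ACPI platform path via hibernation_platform_enter(), and method 5 is retained only as a historic entry; any path that fails to power the machine down ends in kernel_power_off()/kernel_halt().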
14205 +
14206 +#define CLOSE_FILE(file) \
14207 +       do { if (file) { \
14208 +               filp_close(file, NULL); file = NULL; \
14209 +       } } while (0)
14210 +
14211 +static void powerdown_cleanup(int toi_or_resume)
14212 +{
14213 +       if (!toi_or_resume)
14214 +               return;
14215 +
14216 +       CLOSE_FILE(lid_file);
14217 +       CLOSE_FILE(alarm_file);
14218 +       CLOSE_FILE(epoch_file);
14219 +}
14220 +
14221 +static void open_file(char *format, char *arg, struct file **var, int mode,
14222 +               char *desc)
14223 +{
14224 +       char buf[256];
14225 +
14226 +       if (strlen(arg)) {
14227 +               sprintf(buf, format, arg);
14228 +               *var = filp_open(buf, mode, 0);
14229 +               if (IS_ERR(*var) || !*var) {
14230 +                       printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
14231 +                               desc, buf, *var);
14232 +                       *var = NULL;
14233 +               }
14234 +       }
14235 +}
14236 +
14237 +static int powerdown_init(int toi_or_resume)
14238 +{
14239 +       if (!toi_or_resume)
14240 +               return 0;
14241 +
14242 +       did_suspend_to_both = 0;
14243 +
14244 +       open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
14245 +                       O_RDONLY, "lid");
14246 +
14247 +       if (strlen(wake_alarm_dir)) {
14248 +               open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
14249 +                               &alarm_file, O_WRONLY, "alarm");
14250 +
14251 +               open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
14252 +                               &epoch_file, O_RDONLY, "epoch");
14253 +       }
14254 +
14255 +       return 0;
14256 +}
14257 +
14258 +static int lid_closed(void)
14259 +{
14260 +       char array[25];
14261 +       ssize_t size;
14262 +       loff_t pos = 0;
14263 +
14264 +       if (!lid_file)
14265 +               return 0;
14266 +
14267 +       size = vfs_read(lid_file, (char __user *) array, 25, &pos);
14268 +       if ((int) size < 1) {
14269 +               printk(KERN_INFO "Failed to read lid state file (%d).\n",
14270 +                       (int) size);
14271 +               return 0;
14272 +       }
14273 +
14274 +       if (!strcmp(array, "state:      closed\n"))
14275 +               return 1;
14276 +
14277 +       return 0;
14278 +}
14279 +
14280 +static void write_alarm_file(int value)
14281 +{
14282 +       ssize_t size;
14283 +       char buf[40];
14284 +       loff_t pos = 0;
14285 +
14286 +       if (!alarm_file)
14287 +               return;
14288 +
14289 +       sprintf(buf, "%d\n", value);
14290 +
14291 +       size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
14292 +
14293 +       if (size < 0)
14294 +               printk(KERN_INFO "Error %d writing alarm value %s.\n",
14295 +                               (int) size, buf);
14296 +}
14297 +
14298 +/**
14299 + * toi_check_resleep: See whether to powerdown again after waking.
14300 + *
14301 + * After waking, check whether we should powerdown again in a (usually
14302 + * different) way. We only do this if the lid switch is still closed.
14303 + */
14304 +void toi_check_resleep(void)
14305 +{
14306 +       /* We only return if we suspended to ram and woke. */
14307 +       if (lid_closed() && post_wake_state >= 0)
14308 +               __toi_power_down(post_wake_state);
14309 +}
14310 +
14311 +void toi_power_down(void)
14312 +{
14313 +       if (alarm_file && wake_delay) {
14314 +               char array[25];
14315 +               loff_t pos = 0;
14316 +               size_t size = vfs_read(epoch_file, (char __user *) array, 25,
14317 +                               &pos);
14318 +
14319 +               if (((int) size) < 1)
14320 +                       printk(KERN_INFO "Failed to read epoch file (%d).\n",
14321 +                                       (int) size);
14322 +               else {
14323 +                       unsigned long since_epoch =
14324 +                               simple_strtol(array, NULL, 0);
14325 +
14326 +                       /* Clear any wakeup time. */
14327 +                       write_alarm_file(0);
14328 +
14329 +                       /* Set new wakeup time. */
14330 +                       write_alarm_file(since_epoch + wake_delay);
14331 +               }
14332 +       }
14333 +
14334 +       __toi_power_down(toi_poweroff_method);
14335 +
14336 +       toi_check_resleep();
14337 +}
14338 +EXPORT_SYMBOL_GPL(toi_power_down);
14339 +
14340 +static struct toi_sysfs_data sysfs_params[] = {
14341 +#if defined(CONFIG_ACPI)
14342 +       {
14343 +        TOI_ATTR("lid_file", SYSFS_RW),
14344 +        SYSFS_STRING(lid_state_file, 256, 0),
14345 +       },
14346 +
14347 +       {
14348 +         TOI_ATTR("wake_delay", SYSFS_RW),
14349 +         SYSFS_INT(&wake_delay, 0, INT_MAX, 0)
14350 +       },
14351 +
14352 +       {
14353 +         TOI_ATTR("wake_alarm_dir", SYSFS_RW),
14354 +         SYSFS_STRING(wake_alarm_dir, 256, 0)
14355 +       },
14356 +
14357 +       { TOI_ATTR("post_wake_state", SYSFS_RW),
14358 +         SYSFS_INT(&post_wake_state, -1, 5, 0)
14359 +       },
14360 +
14361 +       { TOI_ATTR("powerdown_method", SYSFS_RW),
14362 +         SYSFS_UL(&toi_poweroff_method, 0, 5, 0)
14363 +       },
14364 +
14365 +       { TOI_ATTR("did_suspend_to_both", SYSFS_READONLY),
14366 +         SYSFS_INT(&did_suspend_to_both, 0, 0, 0)
14367 +       },
14368 +#endif
14369 +};
14370 +
14371 +static struct toi_module_ops powerdown_ops = {
14372 +       .type                           = MISC_HIDDEN_MODULE,
14373 +       .name                           = "poweroff",
14374 +       .initialise                     = powerdown_init,
14375 +       .cleanup                        = powerdown_cleanup,
14376 +       .directory                      = "[ROOT]",
14377 +       .module                         = THIS_MODULE,
14378 +       .sysfs_data                     = sysfs_params,
14379 +       .num_sysfs_entries              = sizeof(sysfs_params) /
14380 +               sizeof(struct toi_sysfs_data),
14381 +};
14382 +
14383 +int toi_poweroff_init(void)
14384 +{
14385 +       return toi_register_module(&powerdown_ops);
14386 +}
14387 +
14388 +void toi_poweroff_exit(void)
14389 +{
14390 +       toi_unregister_module(&powerdown_ops);
14391 +}
14392 diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
14393 new file mode 100644
14394 index 0000000..bd2a46f
14395 --- /dev/null
14396 +++ b/kernel/power/tuxonice_power_off.h
14397 @@ -0,0 +1,34 @@
14398 +/*
14399 + * kernel/power/tuxonice_power_off.h
14400 + *
14401 + * Copyright (C) 2006-2007 Nigel Cunningham (nigel at tuxonice net)
14402 + *
14403 + * This file is released under the GPLv2.
14404 + *
14405 + * Support for powering down.
14406 + */
14407 +
14408 +int toi_pm_state_finish(void);
14409 +void toi_power_down(void);
14410 +extern unsigned long toi_poweroff_method;
14411 +extern int toi_platform_prepare(void);
14412 +int toi_poweroff_init(void);
14413 +void toi_poweroff_exit(void);
14414 +void toi_check_resleep(void);
14415 +
14416 +extern int platform_begin(int platform_mode);
14417 +extern int platform_pre_snapshot(int platform_mode);
14418 +extern int platform_leave(int platform_mode);
14419 +extern int platform_end(int platform_mode);
14420 +extern int platform_finish(int platform_mode);
14421 +extern int platform_pre_restore(int platform_mode);
14422 +extern int platform_restore_cleanup(int platform_mode);
14423 +
14424 +#define platform_test() (toi_poweroff_method == 4)
14425 +#define toi_platform_begin() platform_begin(platform_test())
14426 +#define toi_platform_pre_snapshot() platform_pre_snapshot(platform_test())
14427 +#define toi_platform_leave() platform_leave(platform_test())
14428 +#define toi_platform_end() platform_end(platform_test())
14429 +#define toi_platform_finish() platform_finish(platform_test())
14430 +#define toi_platform_pre_restore() platform_pre_restore(platform_test())
14431 +#define toi_platform_restore_cleanup() platform_restore_cleanup(platform_test())
14432 diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
14433 new file mode 100644
14434 index 0000000..68f2fd3
14435 --- /dev/null
14436 +++ b/kernel/power/tuxonice_prepare_image.c
14437 @@ -0,0 +1,1056 @@
14438 +/*
14439 + * kernel/power/tuxonice_prepare_image.c
14440 + *
14441 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
14442 + *
14443 + * This file is released under the GPLv2.
14444 + *
14445 + * We need to eat memory until we can:
14446 + * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
14447 + * 2. Fit it all in available space (toiActiveAllocator->storage_available()
14448 + *    >= main_storage_needed())
14449 + * 3. Reload the pagedir and pageset1 to places that don't collide with their
14450 + *    final destinations, not knowing to what extent the resumed kernel will
14451 + *    overlap with the one loaded at boot time. I think the resumed kernel
14452 + *    should overlap completely, but I don't want to rely on this as it is
14453 + *    an unproven assumption. We therefore assume there will be no overlap at
14454 + *    all (worst case).
14455 + * 4. Meet the user's requested limit (if any) on the size of the image.
14456 + *    The limit is in MB, so pages/256 (assuming 4K pages).
14457 + *
14458 + */
14459 +
14460 +#include <linux/module.h>
14461 +#include <linux/highmem.h>
14462 +#include <linux/freezer.h>
14463 +#include <linux/hardirq.h>
14464 +#include <linux/mmzone.h>
14465 +#include <linux/console.h>
14466 +
14467 +#include "tuxonice_pageflags.h"
14468 +#include "tuxonice_modules.h"
14469 +#include "tuxonice_io.h"
14470 +#include "tuxonice_ui.h"
14471 +#include "tuxonice_extent.h"
14472 +#include "tuxonice_prepare_image.h"
14473 +#include "tuxonice_block_io.h"
14474 +#include "tuxonice.h"
14475 +#include "tuxonice_checksum.h"
14476 +#include "tuxonice_sysfs.h"
14477 +#include "tuxonice_alloc.h"
14478 +
14479 +static long num_nosave, header_space_allocated, main_storage_allocated,
14480 +          storage_available;
14481 +long extra_pd1_pages_allowance = CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE;
14482 +int image_size_limit;
14483 +
14484 +struct attention_list {
14485 +       struct task_struct *task;
14486 +       struct attention_list *next;
14487 +};
14488 +
14489 +static struct attention_list *attention_list;
14490 +
14491 +#define PAGESET1 0
14492 +#define PAGESET2 1
14493 +
14494 +void free_attention_list(void)
14495 +{
14496 +       struct attention_list *last = NULL;
14497 +
14498 +       while (attention_list) {
14499 +               last = attention_list;
14500 +               attention_list = attention_list->next;
14501 +               toi_kfree(6, last);
14502 +       }
14503 +}
14504 +
14505 +static int build_attention_list(void)
14506 +{
14507 +       int i, task_count = 0;
14508 +       struct task_struct *p;
14509 +       struct attention_list *next;
14510 +
14511 +       /*
14512 +        * Count all userspace processes marked PF_NOFREEZE, plus current.
14513 +        */
14514 +       read_lock(&tasklist_lock);
14515 +       for_each_process(p)
14516 +               if ((p->flags & PF_NOFREEZE) || p == current)
14517 +                       task_count++;
14518 +       read_unlock(&tasklist_lock);
14519 +
14520 +       /*
14521 +        * Allocate attention list structs.
14522 +        */
14523 +       for (i = 0; i < task_count; i++) {
14524 +               struct attention_list *this =
14525 +                       toi_kzalloc(6, sizeof(struct attention_list),
14526 +                                       TOI_WAIT_GFP);
14527 +               if (!this) {
14528 +                       printk(KERN_INFO "Failed to allocate slab for "
14529 +                                       "attention list.\n");
14530 +                       free_attention_list();
14531 +                       return 1;
14532 +               }
14533 +               this->next = NULL;
14534 +               if (attention_list)
14535 +                       this->next = attention_list;
14536 +               attention_list = this;
14537 +       }
14538 +
14539 +       next = attention_list;
14540 +       read_lock(&tasklist_lock);
14541 +       for_each_process(p)
14542 +               if ((p->flags & PF_NOFREEZE) || p == current) {
14543 +                       next->task = p;
14544 +                       next = next->next;
14545 +               }
14546 +       read_unlock(&tasklist_lock);
14547 +       return 0;
14548 +}
14549 +
14550 +static void pageset2_full(void)
14551 +{
14552 +       struct zone *zone;
14553 +       unsigned long flags;
14554 +
14555 +       for_each_zone(zone) {
14556 +               spin_lock_irqsave(&zone->lru_lock, flags);
14557 +               if (zone_page_state(zone, NR_INACTIVE)) {
14558 +                       struct page *page;
14559 +                       list_for_each_entry(page, &zone->inactive_list, lru)
14560 +                               SetPagePageset2(page);
14561 +               }
14562 +               if (zone_page_state(zone, NR_ACTIVE)) {
14563 +                       struct page *page;
14564 +                       list_for_each_entry(page, &zone->active_list, lru)
14565 +                               SetPagePageset2(page);
14566 +               }
14567 +               spin_unlock_irqrestore(&zone->lru_lock, flags);
14568 +       }
14569 +}
14570 +
14571 +/*
14572 + * toi_mark_task_as_pageset
14573 + * Functionality   : Marks all the saveable pages belonging to a given process
14574 + *                  as belonging to a particular pageset.
14575 + */
14576 +
14577 +static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
14578 +{
14579 +       struct vm_area_struct *vma;
14580 +       struct mm_struct *mm;
14581 +
14582 +       mm = t->active_mm;
14583 +
14584 +       if (!mm || !mm->mmap)
14585 +               return;
14586 +
14587 +       if (!irqs_disabled())
14588 +               down_read(&mm->mmap_sem);
14589 +
14590 +       for (vma = mm->mmap; vma; vma = vma->vm_next) {
14591 +               unsigned long posn;
14592 +
14593 +               if (vma->vm_flags & (VM_PFNMAP | VM_IO | VM_RESERVED) ||
14594 +                   !vma->vm_start)
14595 +                       continue;
14596 +
14597 +               for (posn = vma->vm_start; posn < vma->vm_end;
14598 +                               posn += PAGE_SIZE) {
14599 +                       struct page *page = follow_page(vma, posn, 0);
14600 +                       if (!page)
14601 +                               continue;
14602 +
14603 +                       if (pageset2)
14604 +                               SetPagePageset2(page);
14605 +                       else {
14606 +                               ClearPagePageset2(page);
14607 +                               SetPagePageset1(page);
14608 +                       }
14609 +               }
14610 +       }
14611 +
14612 +       if (!irqs_disabled())
14613 +               up_read(&mm->mmap_sem);
14614 +}
14615 +
14616 +/* toi_mark_pages_for_pageset2
14617 + *
14618 + * Description:        Mark unshared pages in processes not needed for hibernate so
14619 + *             that they can be written out in a separate pagedir.
14620 + *             HighMem pages are simply marked as pageset2. They won't be
14621 + *             needed during hibernate.
14622 + */
14623 +
14624 +static void toi_mark_pages_for_pageset2(void)
14625 +{
14626 +       struct task_struct *p;
14627 +       struct attention_list *this = attention_list;
14628 +
14629 +       if (test_action_state(TOI_NO_PAGESET2))
14630 +               return;
14631 +
14632 +       clear_dyn_pageflags(&pageset2_map);
14633 +
14634 +       if (test_action_state(TOI_PAGESET2_FULL))
14635 +               pageset2_full();
14636 +       else {
14637 +               read_lock(&tasklist_lock);
14638 +               for_each_process(p) {
14639 +                       if (!p->mm || (p->flags & PF_KTHREAD))
14640 +                               continue;
14641 +
14642 +                       toi_mark_task_as_pageset(p, PAGESET2);
14643 +               }
14644 +               read_unlock(&tasklist_lock);
14645 +       }
14646 +
14647 +       /*
14648 +        * Because the tasks in attention_list are ones related to hibernating,
14649 +        * we know that they won't go away under us.
14650 +        */
14651 +
14652 +       while (this) {
14653 +               if (!test_result_state(TOI_ABORTED))
14654 +                       toi_mark_task_as_pageset(this->task, PAGESET1);
14655 +               this = this->next;
14656 +       }
14657 +}
14658 +
14659 +/*
14660 + * The atomic copy of pageset1 is stored in pageset2 pages.
14661 + * But if pageset1 is larger (normally only just after boot),
14662 + * we need to allocate extra pages to store the atomic copy.
14663 + * The following data struct and functions are used to handle
14664 + * the allocation and freeing of that memory.
14665 + */
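+
+/*
+ * Illustrative figures (not from a real run): if pageset1 held 5000
+ * pages and pageset2 only 4000, roughly 1000 extra pages would have to
+ * be allocated here so that every pageset1 page has somewhere to be
+ * copied to.
+ */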
14666 +
14667 +static long extra_pages_allocated;
14668 +
14669 +struct extras {
14670 +       struct page *page;
14671 +       int order;
14672 +       struct extras *next;
14673 +};
14674 +
14675 +static struct extras *extras_list;
14676 +
14677 +/* toi_free_extra_pagedir_memory
14678 + *
14679 + * Description:        Free previously allocated extra pagedir memory.
14680 + */
14681 +void toi_free_extra_pagedir_memory(void)
14682 +{
14683 +       /* Free allocated pages */
14684 +       while (extras_list) {
14685 +               struct extras *this = extras_list;
14686 +               int i;
14687 +
14688 +               extras_list = this->next;
14689 +
14690 +               for (i = 0; i < (1 << this->order); i++)
14691 +                       ClearPageNosave(this->page + i);
14692 +
14693 +               toi_free_pages(9, this->page, this->order);
14694 +               toi_kfree(7, this);
14695 +       }
14696 +
14697 +       extra_pages_allocated = 0;
14698 +}
14699 +
14700 +/* toi_allocate_extra_pagedir_memory
14701 + *
14702 + * Description:        Allocate memory for making the atomic copy of pagedir1 in the
14703 + *             case where it is bigger than pagedir2.
14704 + * Arguments:  int     extra_pages_needed: Number of extra pages needed.
14705 + * Result:     int.    Number of extra pages we now have allocated.
14706 + */
14707 +static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
14708 +{
14709 +       int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
14710 +       unsigned long flags = TOI_ATOMIC_GFP;
14711 +
14712 +       if (num_to_alloc < 1)
14713 +               return 0;
14714 +
14715 +       order = fls(num_to_alloc);
14716 +       if (order >= MAX_ORDER)
14717 +               order = MAX_ORDER - 1;
14718 +
14719 +       while (num_to_alloc) {
14720 +               struct page *newpage;
14721 +               unsigned long virt;
14722 +               struct extras *extras_entry;
14723 +
14724 +               while ((1 << order) > num_to_alloc)
14725 +                       order--;
14726 +
14727 +               extras_entry = (struct extras *) toi_kzalloc(7,
14728 +                       sizeof(struct extras), TOI_ATOMIC_GFP);
14729 +
14730 +               if (!extras_entry)
14731 +                       return extra_pages_allocated;
14732 +
14733 +               virt = toi_get_free_pages(9, flags, order);
14734 +               while (!virt && order) {
14735 +                       order--;
14736 +                       virt = toi_get_free_pages(9, flags, order);
14737 +               }
14738 +
14739 +               if (!virt) {
14740 +                       toi_kfree(7, extras_entry);
14741 +                       return extra_pages_allocated;
14742 +               }
14743 +
14744 +               newpage = virt_to_page(virt);
14745 +
14746 +               extras_entry->page = newpage;
14747 +               extras_entry->order = order;
14748 +               extras_entry->next = NULL;
14749 +
14750 +               if (extras_list)
14751 +                       extras_entry->next = extras_list;
14752 +
14753 +               extras_list = extras_entry;
14754 +
14755 +               for (j = 0; j < (1 << order); j++) {
14756 +                       SetPageNosave(newpage + j);
14757 +                       SetPagePageset1Copy(newpage + j);
14758 +               }
14759 +
14760 +               extra_pages_allocated += (1 << order);
14761 +               num_to_alloc -= (1 << order);
14762 +       }
14763 +
14764 +       return extra_pages_allocated;
14765 +}
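+
+/*
+ * Illustrative trace of the allocator above: a request for 37 extra
+ * pages starts at order 5 (fls(37) is 6 and 1 << 6 > 37), so it tries
+ * a 32-page block, then a 4-page block (order 2), then a single page:
+ * 32 + 4 + 1 == 37. If a higher-order allocation fails, the order is
+ * stepped down and the remainder is satisfied with smaller blocks.
+ */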
14766 +
14767 +/*
14768 + * real_nr_free_pages: Count free pages, including per-cpu (pcp) pages,
14769 + * for the zones whose zone_idx() bits are set in zone_idx_mask.
14770 + */
14771 +long real_nr_free_pages(unsigned long zone_idx_mask)
14772 +{
14773 +       struct zone *zone;
14774 +       int result = 0, cpu;
14775 +
14776 +       /* PCP lists */
14777 +       for_each_zone(zone) {
14778 +               if (!populated_zone(zone))
14779 +                       continue;
14780 +
14781 +               if (!(zone_idx_mask & (1 << zone_idx(zone))))
14782 +                       continue;
14783 +
14784 +               for_each_online_cpu(cpu) {
14785 +                       struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
14786 +                       struct per_cpu_pages *pcp = &pset->pcp;
14787 +                       result += pcp->count;
14788 +               }
14789 +
14790 +               result += zone_page_state(zone, NR_FREE_PAGES);
14791 +       }
14792 +       return result;
14793 +}
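+
+/*
+ * Usage sketch: real_nr_free_pages(1 << ZONE_NORMAL) counts free and
+ * pcp pages in ZONE_NORMAL only; the all_zones_mask and
+ * real_nr_free_low_pages() helpers in tuxonice_prepare_image.h build
+ * the masks used throughout this file.
+ */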
14794 +
14795 +/*
14796 + * Discover how much extra memory will be required by the drivers
14797 + * when they're asked to hibernate. We can then ensure that amount
14798 + * of memory is available when we really want it.
14799 + */
14800 +static void get_extra_pd1_allowance(void)
14801 +{
14802 +       long orig_num_free = real_nr_free_pages(all_zones_mask), final;
14803 +
14804 +       toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
14805 +
14806 +       suspend_console();
14807 +       device_suspend(PMSG_FREEZE);
14808 +       device_pm_lock();
14809 +       local_irq_disable(); /* irqs might have been re-enabled on us */
14810 +       device_power_down(PMSG_FREEZE);
14811 +
14812 +       final = real_nr_free_pages(all_zones_mask);
14813 +
14814 +       device_power_up(PMSG_THAW);
14815 +       local_irq_enable();
14816 +       device_pm_unlock();
14817 +       device_resume(PMSG_THAW);
14818 +       resume_console();
14819 +
14820 +       extra_pd1_pages_allowance = max(
14821 +               orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE,
14822 +               (long) MIN_EXTRA_PAGES_ALLOWANCE);
14823 +}
14824 +
14825 +/*
14826 + * Amount of storage needed, possibly taking into account the
14827 + * expected compression ratio and possibly also ignoring our
14828 + * allowance for extra pages.
14829 + */
14830 +static long main_storage_needed(int use_ecr,
14831 +               int ignore_extra_pd1_allow)
14832 +{
14833 +       return (pagedir1.size + pagedir2.size +
14834 +         (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
14835 +        (use_ecr ? toi_expected_compression_ratio() : 100) / 100;
14836 +}
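+
+/*
+ * For example (hypothetical figures): 100000 image pages with an
+ * expected compression ratio of 60% need 100000 * 60 / 100 == 60000
+ * pages of main storage when use_ecr is set.
+ */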
14837 +
14838 +/*
14839 + * Storage needed for the image header: computed in bytes, returned in pages.
14840 + */
14841 +static long header_storage_needed(void)
14842 +{
14843 +       long bytes = (int) sizeof(struct toi_header) +
14844 +                       toi_header_storage_for_modules() +
14845 +                       toi_pageflags_space_needed();
14846 +
14847 +       return DIV_ROUND_UP(bytes, PAGE_SIZE);
14848 +}
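+
+/*
+ * For example, 10000 bytes of header data with 4K pages rounds up to
+ * DIV_ROUND_UP(10000, 4096) == 3 pages.
+ */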
14849 +
14850 +/*
14851 + * When freeing memory, pages from either pageset might be freed.
14852 + *
14853 + * When seeking to free memory to be able to hibernate, for every ps1 page
14854 + * freed, we need two fewer pages for the atomic copy: there is one less
14855 + * page to copy and one more page into which data can be copied.
14856 + *
14857 + * Freeing ps2 pages saves us nothing directly. No more memory is available
14858 + * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
14859 + * that's too much work to figure out.
14860 + *
14861 + * => ps1_to_free functions
14862 + *
14863 + * Of course if we just want to reduce the image size, because of storage
14864 + * limitations or an image size limit, either pageset will do.
14865 + *
14866 + * => any_to_free function
14867 + */
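+
+/*
+ * For example, freeing 10 pageset1 pages reduces the shortfall computed
+ * below by about 20 pages: 10 fewer pages to copy, and 10 more pages
+ * into which the atomic copy can go (hence the division by 2 in the
+ * helpers that follow).
+ */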
14868 +
14869 +static long highpages_ps1_to_free(void)
14870 +{
14871 +       return max_t(long, 0, DIV_ROUND_UP(get_highmem_size(pagedir1) -
14872 +               get_highmem_size(pagedir2), 2) - real_nr_free_high_pages());
14873 +}
14874 +
14875 +static long lowpages_ps1_to_free(void)
14876 +{
14877 +       return max_t(long, 0, DIV_ROUND_UP(get_lowmem_size(pagedir1) +
14878 +               extra_pd1_pages_allowance + MIN_FREE_RAM +
14879 +               toi_memory_for_modules(0) - get_lowmem_size(pagedir2) -
14880 +               real_nr_free_low_pages() - extra_pages_allocated, 2));
14881 +}
14882 +
14883 +static long current_image_size(void)
14884 +{
14885 +       return pagedir1.size + pagedir2.size + header_space_allocated;
14886 +}
14887 +
14888 +static long storage_still_required(void)
14889 +{
14890 +       return max_t(long, 0, main_storage_needed(1, 1) - storage_available);
14891 +}
14892 +
14893 +static long ram_still_required(void)
14894 +{
14895 +       return max_t(long, 0, MIN_FREE_RAM + toi_memory_for_modules(0) -
14896 +               real_nr_free_low_pages() + 2 * extra_pd1_pages_allowance);
14897 +}
14898 +
14899 +static long any_to_free(int use_image_size_limit)
14900 +{
14901 +       long user_limit = (use_image_size_limit && image_size_limit > 0) ?
14902 +               max_t(long, 0, current_image_size() - (image_size_limit << 8))
14903 +               : 0;
14904 +
14905 +       long storage_limit = storage_still_required(),
14906 +           ram_limit = ram_still_required();
14907 +
14908 +       return max(max(user_limit, storage_limit), ram_limit);
14909 +}
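+
+/*
+ * Worked example (hypothetical figures): with a current image of 150000
+ * pages and an image_size_limit of 512MB (131072 pages), user_limit is
+ * 150000 - 131072 == 18928 pages; the result is the largest of that,
+ * the storage shortfall and the RAM shortfall.
+ */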
14910 +
14911 +/* amount_needed
14912 + *
14913 + * Calculates the amount by which the image size needs to be reduced to meet
14914 + * our constraints.
14915 + */
14916 +static long amount_needed(int use_image_size_limit)
14917 +{
14918 +       return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
14919 +                       any_to_free(use_image_size_limit));
14920 +}
14921 +
14922 +static long image_not_ready(int use_image_size_limit)
14923 +{
14924 +       toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
14925 +               "Amount still needed (%ld) > 0:%d. Header: %ld < %ld: %d,"
14926 +               " Storage allocd: %ld < %ld: %d.\n",
14927 +                       amount_needed(use_image_size_limit),
14928 +                       (amount_needed(use_image_size_limit) > 0),
14929 +                       header_space_allocated, header_storage_needed(),
14930 +                       header_space_allocated < header_storage_needed(),
14931 +                       main_storage_allocated,
14932 +                       main_storage_needed(1, 1),
14933 +                       main_storage_allocated < main_storage_needed(1, 1));
14934 +
14935 +       toi_cond_pause(0, NULL);
14936 +
14937 +       return (amount_needed(use_image_size_limit) > 0) ||
14938 +               header_space_allocated < header_storage_needed() ||
14939 +                main_storage_allocated < main_storage_needed(1, 1);
14940 +}
14941 +
14942 +static void display_failure_reason(int tries_exceeded)
14943 +{
14944 +       long storage_required = storage_still_required(),
14945 +           ram_required = ram_still_required(),
14946 +           high_ps1 = highpages_ps1_to_free(),
14947 +           low_ps1 = lowpages_ps1_to_free();
14948 +
14949 +       printk(KERN_INFO "Failed to prepare the image because...\n");
14950 +
14951 +       if (!storage_available) {
14952 +               printk(KERN_INFO "- You need some storage available to be "
14953 +                               "able to hibernate.\n");
14954 +               return;
14955 +       }
14956 +
14957 +       if (tries_exceeded)
14958 +               printk(KERN_INFO "- The maximum number of iterations was "
14959 +                               "reached without successfully preparing the "
14960 +                               "image.\n");
14961 +
14962 +       if (header_space_allocated < header_storage_needed()) {
14963 +               printk(KERN_INFO "- Insufficient header storage allocated. "
14964 +                               "Need %ld, have %ld.\n",
14965 +                               header_storage_needed(),
14966 +                               header_space_allocated);
14967 +               set_abort_result(TOI_INSUFFICIENT_STORAGE);
14968 +       }
14969 +
14970 +       if (storage_required) {
14971 +               printk(KERN_INFO " - We need at least %ld pages of storage "
14972 +                               "(ignoring the header), but only have %ld.\n",
14973 +                               main_storage_needed(1, 1),
14974 +                               main_storage_allocated);
14975 +               set_abort_result(TOI_INSUFFICIENT_STORAGE);
14976 +       }
14977 +
14978 +       if (ram_required) {
14979 +               printk(KERN_INFO " - We need %ld more free pages of low "
14980 +                               "memory.\n", ram_required);
14981 +               printk(KERN_INFO "     Minimum free     : %8d\n", MIN_FREE_RAM);
14982 +               printk(KERN_INFO "   + Reqd. by modules : %8ld\n",
14983 +                               toi_memory_for_modules(0));
14984 +               printk(KERN_INFO "   - Currently free   : %8ld\n",
14985 +                               real_nr_free_low_pages());
14986 +               printk(KERN_INFO "   + 2 * extra allow  : %8ld\n",
14987 +                               2 * extra_pd1_pages_allowance);
14988 +               printk(KERN_INFO "                      : ========\n");
14989 +               printk(KERN_INFO "     Still needed     : %8ld\n",
14990 +                               ram_required);
14991 +
14992 +               /* Print breakdown of memory needed for modules */
14993 +               toi_memory_for_modules(1);
14994 +               set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
14995 +       }
14996 +
14997 +       if (high_ps1) {
14998 +               printk(KERN_INFO "- We need to free %ld highmem pageset 1 "
14999 +                               "pages.\n", high_ps1);
15000 +               set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
15001 +       }
15002 +
15003 +       if (low_ps1) {
15004 +               printk(KERN_INFO " - We need to free %ld lowmem pageset 1 "
15005 +                               "pages.\n", low_ps1);
15006 +               set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
15007 +       }
15008 +}
15009 +
15010 +static void display_stats(int always, int sub_extra_pd1_allow)
15011 +{
15012 +       char buffer[255];
15013 +       snprintf(buffer, 254,
15014 +               "Free:%ld(%ld). Sets:%ld(%ld),%ld(%ld). Header:%ld/%ld. "
15015 +               "Nosave:%ld-%ld=%ld. Storage:%lu/%lu(%lu=>%lu). "
15016 +               "Needed:%ld,%ld,%ld(%d,%ld,%ld,%ld)\n",
15017 +
15018 +               /* Free */
15019 +               real_nr_free_pages(all_zones_mask),
15020 +               real_nr_free_low_pages(),
15021 +
15022 +               /* Sets */
15023 +               pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
15024 +               pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
15025 +
15026 +               /* Header */
15027 +               header_space_allocated, header_storage_needed(),
15028 +
15029 +               /* Nosave */
15030 +               num_nosave, extra_pages_allocated,
15031 +               num_nosave - extra_pages_allocated,
15032 +
15033 +               /* Storage */
15034 +               main_storage_allocated,
15035 +               storage_available,
15036 +               main_storage_needed(1, sub_extra_pd1_allow),
15037 +               main_storage_needed(1, 1),
15038 +
15039 +               /* Needed */
15040 +               lowpages_ps1_to_free(), highpages_ps1_to_free(),
15041 +               any_to_free(1),
15042 +               MIN_FREE_RAM, toi_memory_for_modules(0),
15043 +               extra_pd1_pages_allowance, ((long) image_size_limit) << 8);
15044 +
15045 +       if (always)
15046 +               printk("%s", buffer);
15047 +       else
15048 +               toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
15049 +}
15050 +
15051 +/* generate_free_page_map
15052 + *
15053 + * Description:        This routine generates a bitmap of free pages from the
15054 + *             lists used by the memory manager. We then use the bitmap
15055 + *             to quickly calculate which pages to save and in which
15056 + *             pagesets.
15057 + */
15058 +static void generate_free_page_map(void)
15059 +{
15060 +       int order, pfn, cpu, t;
15061 +       unsigned long flags, i;
15062 +       struct zone *zone;
15063 +       struct list_head *curr;
15064 +
15065 +       for_each_zone(zone) {
15066 +               if (!populated_zone(zone))
15067 +                       continue;
15068 +
15069 +               spin_lock_irqsave(&zone->lock, flags);
15070 +
15071 +               for (i = 0; i < zone->spanned_pages; i++)
15072 +                       ClearPageNosaveFree(pfn_to_page(
15073 +                                               zone->zone_start_pfn + i));
15074 +
15075 +               for_each_migratetype_order(order, t) {
15076 +                       list_for_each(curr,
15077 +                                       &zone->free_area[order].free_list[t]) {
15078 +                               unsigned long i;
15079 +
15080 +                               pfn = page_to_pfn(list_entry(curr, struct page,
15081 +                                                       lru));
15082 +                               for (i = 0; i < (1UL << order); i++)
15083 +                                       SetPageNosaveFree(pfn_to_page(pfn + i));
15084 +                       }
15085 +               }
15086 +
15087 +               for_each_online_cpu(cpu) {
15088 +                       struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
15089 +                       struct per_cpu_pages *pcp = &pset->pcp;
15090 +                       struct page *page;
15091 +
15092 +                       list_for_each_entry(page, &pcp->list, lru)
15093 +                               SetPageNosaveFree(page);
15094 +               }
15095 +
15096 +               spin_unlock_irqrestore(&zone->lock, flags);
15097 +       }
15098 +}
15099 +
15100 +/* size_of_free_region
15101 + *
15102 + * Description:        Return the number of pages that are free, beginning with and
15103 + *             including this one.
15104 + */
15105 +static int size_of_free_region(struct page *page)
15106 +{
15107 +       struct zone *zone = page_zone(page);
15108 +       unsigned long this_pfn = page_to_pfn(page),
15109 +                     orig_pfn = this_pfn,
15110 +                     end_pfn = zone->zone_start_pfn + zone->spanned_pages - 1;
15111 +
15112 +       while (this_pfn <= end_pfn && PageNosaveFree(pfn_to_page(this_pfn)))
15113 +               this_pfn++;
15114 +
15115 +       return this_pfn - orig_pfn;
15116 +}
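+
+/*
+ * For example, if pfns 100-103 are marked NosaveFree and pfn 104 is
+ * not, calling size_of_free_region() on pfn 100 returns 4, letting the
+ * caller skip the whole free run in one step.
+ */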
15117 +
15118 +/* flag_image_pages
15119 + *
15120 + * This routine generates our lists of pages to be stored in each
15121 + * pageset. Since we store the data using extents, and adding new
15122 + * extents might allocate a new extent page, this routine may well
15123 + * be called more than once.
15124 + */
15125 +static void flag_image_pages(int atomic_copy)
15126 +{
15127 +       int num_free = 0;
15128 +       unsigned long loop;
15129 +       struct zone *zone;
15130 +
15131 +       pagedir1.size = 0;
15132 +       pagedir2.size = 0;
15133 +
15134 +       set_highmem_size(pagedir1, 0);
15135 +       set_highmem_size(pagedir2, 0);
15136 +
15137 +       num_nosave = 0;
15138 +
15139 +       clear_dyn_pageflags(&pageset1_map);
15140 +
15141 +       generate_free_page_map();
15142 +
15143 +       /*
15144 +        * Pages not to be saved are marked Nosave irrespective of being
15145 +        * reserved.
15146 +        */
15147 +       for_each_zone(zone) {
15148 +               int highmem = is_highmem(zone);
15149 +
15150 +               if (!populated_zone(zone))
15151 +                       continue;
15152 +
15153 +               for (loop = 0; loop < zone->spanned_pages; loop++) {
15154 +                       unsigned long pfn = zone->zone_start_pfn + loop;
15155 +                       struct page *page;
15156 +                       int chunk_size;
15157 +
15158 +                       if (!pfn_valid(pfn))
15159 +                               continue;
15160 +
15161 +                       page = pfn_to_page(pfn);
15162 +
15163 +                       chunk_size = size_of_free_region(page);
15164 +                       if (chunk_size) {
15165 +                               num_free += chunk_size;
15166 +                               loop += chunk_size - 1;
15167 +                               continue;
15168 +                       }
15169 +
15170 +                       if (highmem)
15171 +                               page = saveable_highmem_page(pfn);
15172 +                       else
15173 +                               page = saveable_page(pfn);
15174 +
15175 +                       if (!page || PageNosave(page)) {
15176 +                               num_nosave++;
15177 +                               continue;
15178 +                       }
15179 +
15180 +                       if (PagePageset2(page)) {
15181 +                               pagedir2.size++;
15182 +                               if (PageHighMem(page))
15183 +                                       inc_highmem_size(pagedir2);
15184 +                               else
15185 +                                       SetPagePageset1Copy(page);
15186 +                               if (PageResave(page)) {
15187 +                                       SetPagePageset1(page);
15188 +                                       ClearPagePageset1Copy(page);
15189 +                                       pagedir1.size++;
15190 +                                       if (PageHighMem(page))
15191 +                                               inc_highmem_size(pagedir1);
15192 +                               }
15193 +                       } else {
15194 +                               pagedir1.size++;
15195 +                               SetPagePageset1(page);
15196 +                               if (PageHighMem(page))
15197 +                                       inc_highmem_size(pagedir1);
15198 +                       }
15199 +               }
15200 +       }
15201 +
15202 +       if (atomic_copy)
15203 +               return;
15204 +
15205 +       toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
15206 +               "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld) + "
15207 +               "NumFree (%d) = %d.\n",
15208 +               pagedir1.size, pagedir2.size, num_nosave, num_free,
15209 +               pagedir1.size + pagedir2.size + num_nosave + num_free);
15210 +}
15211 +
15212 +void toi_recalculate_image_contents(int atomic_copy)
15213 +{
15214 +       clear_dyn_pageflags(&pageset1_map);
15215 +       if (!atomic_copy) {
15216 +               int pfn;
15217 +               BITMAP_FOR_EACH_SET(&pageset2_map, pfn)
15218 +                       ClearPagePageset1Copy(pfn_to_page(pfn));
15219 +               /* Need to call this before getting pageset1_size! */
15220 +               toi_mark_pages_for_pageset2();
15221 +       }
15222 +       flag_image_pages(atomic_copy);
15223 +
15224 +       if (!atomic_copy) {
15225 +               storage_available = toiActiveAllocator->storage_available();
15226 +               display_stats(0, 0);
15227 +       }
15228 +}
15229 +
15230 +/* update_image
15231 + *
15232 + * Allocate [more] memory and storage for the image.
15233 + */
15234 +static void update_image(void)
15235 +{
15236 +       int wanted, got;
15237 +       long seek;
15238 +
15239 +       toi_recalculate_image_contents(0);
15240 +
15241 +       /* Include allowance for growth in pagedir1 while writing pagedir 2 */
15242 +       wanted = pagedir1.size +  extra_pd1_pages_allowance -
15243 +               get_lowmem_size(pagedir2);
15244 +       if (wanted > extra_pages_allocated) {
15245 +               got = toi_allocate_extra_pagedir_memory(wanted);
15246 +               if (got < wanted) {
15247 +                       toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
15248 +                               "Want %d extra pages for pageset1, got %d.\n",
15249 +                               wanted, got);
15250 +                       return;
15251 +               }
15252 +       }
15253 +
15254 +       thaw_kernel_threads();
15255 +
15256 +       /*
15257 +        * Allocate remaining storage space, if possible, up to the
15258 +        * maximum we know we'll need. It's okay to allocate the
15259 +        * maximum if the writer is the swapwriter, but
15260 +        * we don't want to grab all available space on an NFS share.
15261 +        * We therefore ignore the expected compression ratio here,
15262 +        * thereby trying to allocate the maximum image size we could
15263 +        * need (assuming compression doesn't expand the image), but
15264 +        * don't complain if we can't get the full amount we're after.
15265 +        */
15266 +
15267 +       storage_available = toiActiveAllocator->storage_available();
15268 +
15269 +       header_space_allocated = header_storage_needed();
15270 +
15271 +       toiActiveAllocator->reserve_header_space(header_space_allocated);
15272 +
15273 +       seek = min(storage_available, main_storage_needed(0, 0));
15274 +
15275 +       toiActiveAllocator->allocate_storage(seek);
15276 +
15277 +       main_storage_allocated = toiActiveAllocator->storage_allocated();
15278 +
15279 +       if (freeze_processes())
15280 +               set_abort_result(TOI_FREEZING_FAILED);
15281 +
15282 +       toi_recalculate_image_contents(0);
15283 +}
15284 +
15285 +/* attempt_to_freeze
15286 + *
15287 + * Try to freeze processes.
15288 + */
15289 +
15290 +static int attempt_to_freeze(void)
15291 +{
15292 +       int result;
15293 +
15294 +       /* Stop processes before checking again */
15295 +       thaw_processes();
15296 +       toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
15297 +                       "filesystems.");
15298 +       result = freeze_processes();
15299 +
15300 +       if (result)
15301 +               set_abort_result(TOI_FREEZING_FAILED);
15302 +
15303 +       return result;
15304 +}
15305 +
15306 +/* eat_memory
15307 + *
15308 + * Try to free some memory, either to meet hard or soft constraints on the image
15309 + * characteristics.
15310 + *
15311 + * Hard constraints:
15312 + * - Pageset1 must be < half of memory;
15313 + * - We must have enough memory free at resume time to have pageset1
15314 + *   be able to be loaded in pages that don't conflict with where it has to
15315 + *   be restored.
15316 + * Soft constraints:
15317 + * - User-specified image size limit.
15318 + */
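+
+/*
+ * image_size_limit encoding, as used below: -1 means never eat memory
+ * (abort instead), -2 means drop page caches only, and a positive
+ * value is a limit in MB, compared as pages via image_size_limit << 8.
+ */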
15319 +static void eat_memory(void)
15320 +{
15321 +       long amount_wanted = 0;
15322 +       int did_eat_memory = 0;
15323 +
15324 +       /*
15325 +        * Note that if we have enough storage space and enough free memory, we
15326 +        * may exit without eating anything. We give up when the last 10
15327 +        * iterations ate no extra pages, because we're not going to get much
15328 +        * more anyway, and the few pages we would get would take a lot of time.
15329 +        *
15330 +        * We freeze processes before beginning, and then unfreeze them if we
15331 +        * need to eat memory until we think we have enough. If our attempts
15332 +        * to freeze fail, we give up and abort.
15333 +        */
15334 +
15335 +       toi_recalculate_image_contents(0);
15336 +       amount_wanted = amount_needed(1);
15337 +
15338 +       switch (image_size_limit) {
15339 +       case -1: /* Don't eat any memory */
15340 +               if (amount_wanted > 0) {
15341 +                       set_abort_result(TOI_WOULD_EAT_MEMORY);
15342 +                       return;
15343 +               }
15344 +               break;
15345 +       case -2:  /* Free caches only */
15346 +               drop_pagecache();
15347 +               toi_recalculate_image_contents(0);
15348 +               amount_wanted = amount_needed(1);
15349 +               did_eat_memory = 1;
15350 +               break;
15351 +       default:
15352 +               break;
15353 +       }
15354 +
15355 +       if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
15356 +                       image_size_limit != -1) {
15357 +               struct zone *zone;
15358 +               int zone_idx;
15359 +
15360 +               toi_prepare_status(CLEAR_BAR,
15361 +                               "Seeking to free %ldMB of memory.",
15362 +                               MB(amount_wanted));
15363 +
15364 +               thaw_kernel_threads();
15365 +
15366 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
15367 +                       long zone_type_free = max_t(long,
15368 +                                       (zone_idx == ZONE_HIGHMEM) ?
15369 +                                       highpages_ps1_to_free() :
15370 +                                       lowpages_ps1_to_free(), amount_wanted);
15371 +
15372 +                       if (zone_type_free < 0)
15373 +                               break;
15374 +
15375 +                       for_each_zone(zone) {
15376 +                               if (zone_idx(zone) != zone_idx)
15377 +                                       continue;
15378 +
15379 +                               shrink_one_zone(zone, zone_type_free, 3);
15380 +
15381 +                               did_eat_memory = 1;
15382 +
15383 +                               toi_recalculate_image_contents(0);
15384 +
15385 +                               amount_wanted = amount_needed(1);
15386 +                               zone_type_free = max_t(long,
15387 +                                       (zone_idx == ZONE_HIGHMEM) ?
15388 +                                       highpages_ps1_to_free() :
15389 +                                       lowpages_ps1_to_free(), amount_wanted);
15390 +
15391 +                               if (zone_type_free < 0)
15392 +                                       break;
15393 +                       }
15394 +               }
15395 +
15396 +               toi_cond_pause(0, NULL);
15397 +
15398 +               if (freeze_processes())
15399 +                       set_abort_result(TOI_FREEZING_FAILED);
15400 +       }
15401 +
15402 +       if (did_eat_memory) {
15403 +               unsigned long orig_state = get_toi_state();
15404 +               /* Freeze_processes will call sys_sync too */
15405 +               restore_toi_state(orig_state);
15406 +               toi_recalculate_image_contents(0);
15407 +       }
15408 +
15409 +       /* Blank out image size display */
15410 +       toi_update_status(100, 100, NULL);
15411 +}
15412 +
15413 +/* toi_prepare_image
15414 + *
15415 + * Entry point to the whole image preparation section.
15416 + *
15417 + * We do four things:
15418 + * - Freeze processes;
15419 + * - Ensure image size constraints are met;
15420 + * - Complete all the preparation for saving the image,
15421 + *   including allocation of storage. The only memory
15422 + *   that should be needed when we're finished is that
15423 + *   for actually storing the image (and we know how
15424 + *   much is needed for that because the modules tell
15425 + *   us).
15426 + * - Make sure that all dirty buffers are written out.
15427 + */
15428 +#define MAX_TRIES 2
15429 +int toi_prepare_image(void)
15430 +{
15431 +       int result = 1, tries = 1;
15432 +
15433 +       header_space_allocated = 0;
15434 +       main_storage_allocated = 0;
15435 +
15436 +       if (attempt_to_freeze())
15437 +               return 1;
15438 +
15439 +       if (!extra_pd1_pages_allowance)
15440 +               get_extra_pd1_allowance();
15441 +
15442 +       storage_available = toiActiveAllocator->storage_available();
15443 +
15444 +       if (!storage_available) {
15445 +               printk(KERN_INFO "No storage available. Didn't try to prepare "
15446 +                               "an image.\n");
15447 +               display_failure_reason(0);
15448 +               set_abort_result(TOI_NOSTORAGE_AVAILABLE);
15449 +               return 1;
15450 +       }
15451 +
15452 +       if (build_attention_list()) {
15453 +               abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
15454 +                               "Unable to successfully prepare the image.\n");
15455 +               return 1;
15456 +       }
15457 +
15458 +       do {
15459 +               toi_prepare_status(CLEAR_BAR,
15460 +                               "Preparing Image. Try %d.", tries);
15461 +
15462 +               eat_memory();
15463 +
15464 +               if (test_result_state(TOI_ABORTED))
15465 +                       break;
15466 +
15467 +               update_image();
15468 +
15469 +               tries++;
15470 +
15471 +       } while (image_not_ready(1) && tries <= MAX_TRIES &&
15472 +                       !test_result_state(TOI_ABORTED));
15473 +
15474 +       result = image_not_ready(0);
15475 +
15476 +       if (!test_result_state(TOI_ABORTED)) {
15477 +               if (result) {
15478 +                       display_stats(1, 0);
15479 +                       display_failure_reason(tries > MAX_TRIES);
15480 +                       abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
15481 +                               "Unable to successfully prepare the image.\n");
15482 +               } else {
15483 +                       unlink_lru_lists();
15484 +                       toi_cond_pause(1, "Image preparation complete.");
15485 +               }
15486 +       }
15487 +
15488 +       return result ? result : allocate_checksum_pages();
15489 +}
15490 +
15491 +#ifdef CONFIG_TOI_EXPORTS
15492 +EXPORT_SYMBOL_GPL(real_nr_free_pages);
15493 +#endif
15494 diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
15495 new file mode 100644
15496 index 0000000..428ab11
15497 --- /dev/null
15498 +++ b/kernel/power/tuxonice_prepare_image.h
15499 @@ -0,0 +1,35 @@
15500 +/*
15501 + * kernel/power/tuxonice_prepare_image.h
15502 + *
15503 + * Copyright (C) 2003-2007 Nigel Cunningham (nigel at tuxonice net)
15504 + *
15505 + * This file is released under the GPLv2.
15506 + *
15507 + */
15508 +
15509 +#include <asm/sections.h>
15510 +
15511 +extern int toi_prepare_image(void);
15512 +extern void toi_recalculate_image_contents(int atomic_copy);
15513 +extern long real_nr_free_pages(unsigned long zone_idx_mask);
15514 +extern int image_size_limit;
15515 +extern void toi_free_extra_pagedir_memory(void);
15516 +extern long extra_pd1_pages_allowance;
15517 +extern void free_attention_list(void);
15518 +
15519 +#define MIN_FREE_RAM 100
15520 +#define MIN_EXTRA_PAGES_ALLOWANCE 500
15521 +
15522 +#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
15523 +#ifdef CONFIG_HIGHMEM
15524 +#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
15525 +#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
15526 +                                               (1 << ZONE_HIGHMEM)))
15527 +#else
15528 +#define real_nr_free_high_pages() (0)
15529 +#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
15530 +
15531 +/* For eat_memory function */
15532 +#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
15533 +#endif
15534 +
15535 diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
15536 new file mode 100644
15537 index 0000000..c3352f6
15538 --- /dev/null
15539 +++ b/kernel/power/tuxonice_storage.c
15540 @@ -0,0 +1,293 @@
15541 +/*
15542 + * kernel/power/tuxonice_storage.c
15543 + *
15544 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
15545 + *
15546 + * This file is released under the GPLv2.
15547 + *
15548 + * Routines for talking to a userspace program that manages storage.
15549 + *
15550 + * The kernel side:
15551 + * - starts the userspace program;
15552 + * - sends messages telling it when to open and close the connection;
15553 + * - tells it when to quit;
15554 + *
15555 + * The user space side:
15556 + * - passes messages regarding status;
15557 + *
15558 + */
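+
+/*
+ * A typical exchange, pieced together from the code below: the kernel
+ * sends USM_MSG_CONNECT (or USM_MSG_DISCONNECT) over netlink, then
+ * waits up to two seconds for the helper to reply with USM_MSG_SUCCESS
+ * or USM_MSG_FAILED, which usm_user_rcv_msg() records in
+ * message_received.
+ */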
15559 +
15560 +#include <linux/suspend.h>
15561 +#include <linux/freezer.h>
15562 +
15563 +#include "tuxonice_sysfs.h"
15564 +#include "tuxonice_modules.h"
15565 +#include "tuxonice_netlink.h"
15566 +#include "tuxonice_storage.h"
15567 +#include "tuxonice_ui.h"
15568 +
15569 +static struct user_helper_data usm_helper_data;
15570 +static struct toi_module_ops usm_ops;
15571 +static int message_received, usm_prepare_count;
15572 +static int storage_manager_last_action, storage_manager_action;
15573 +
15574 +static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
15575 +{
15576 +       int type;
15577 +       int *data;
15578 +
15579 +       type = nlh->nlmsg_type;
15580 +
15581 +       /* Control messages: ignore them */
15582 +       if (type < NETLINK_MSG_BASE)
15583 +               return 0;
15584 +
15585 +       /* Unknown message: reply with EINVAL */
15586 +       if (type >= USM_MSG_MAX)
15587 +               return -EINVAL;
15588 +
15589 +       /* All operations require privileges, even GET */
15590 +       if (security_netlink_recv(skb, CAP_NET_ADMIN))
15591 +               return -EPERM;
15592 +
15593 +       /* Only allow one task to receive NOFREEZE privileges */
15594 +       if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
15595 +               return -EBUSY;
15596 +
15597 +       data = (int *) NLMSG_DATA(nlh);
15598 +
15599 +       switch (type) {
15600 +       case USM_MSG_SUCCESS:
15601 +       case USM_MSG_FAILED:
15602 +               message_received = type;
15603 +               complete(&usm_helper_data.wait_for_process);
15604 +               break;
15605 +       default:
15606 +               printk(KERN_INFO "Storage manager doesn't recognise "
15607 +                               "message %d.\n", type);
15608 +       }
15609 +
15610 +       return 1;
15611 +}
15612 +
15613 +#ifdef CONFIG_NET
15614 +static int activations;
15615 +
15616 +int toi_activate_storage(int force)
15617 +{
15618 +       int tries = 1;
15619 +
15620 +       if (usm_helper_data.pid == -1 || !usm_ops.enabled)
15621 +               return 0;
15622 +
15623 +       message_received = 0;
15624 +       activations++;
15625 +
15626 +       if (activations > 1 && !force)
15627 +               return 0;
15628 +
15629 +       while ((!message_received || message_received == USM_MSG_FAILED) &&
15630 +                       tries < 2) {
15631 +               toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
15632 +                               "%d.\n", tries);
15633 +
15634 +               init_completion(&usm_helper_data.wait_for_process);
15635 +
15636 +               toi_send_netlink_message(&usm_helper_data,
15637 +                       USM_MSG_CONNECT,
15638 +                       NULL, 0);
15639 +
15640 +               /* Wait 2 seconds for the userspace process to make contact */
15641 +               wait_for_completion_timeout(&usm_helper_data.wait_for_process,
15642 +                               2*HZ);
15643 +
15644 +               tries++;
15645 +       }
15646 +
15647 +       return 0;
15648 +}
15649 +
15650 +int toi_deactivate_storage(int force)
15651 +{
15652 +       if (usm_helper_data.pid == -1 || !usm_ops.enabled)
15653 +               return 0;
15654 +
15655 +       message_received = 0;
15656 +       activations--;
15657 +
15658 +       if (activations && !force)
15659 +               return 0;
15660 +
15661 +       init_completion(&usm_helper_data.wait_for_process);
15662 +
15663 +       toi_send_netlink_message(&usm_helper_data,
15664 +                       USM_MSG_DISCONNECT,
15665 +                       NULL, 0);
15666 +
15667 +       wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
15668 +
15669 +       if (!message_received || message_received == USM_MSG_FAILED) {
15670 +               printk(KERN_INFO "Returning failure disconnecting storage.\n");
15671 +               return 1;
15672 +       }
15673 +
15674 +       return 0;
15675 +}
15676 +#endif
15677 +
15678 +static void storage_manager_simulate(void)
15679 +{
15680 +       printk(KERN_INFO "--- Storage manager simulate ---\n");
15681 +       toi_prepare_usm();
15682 +       schedule();
15683 +       printk(KERN_INFO "--- Activate storage 1 ---\n");
15684 +       toi_activate_storage(1);
15685 +       schedule();
15686 +       printk(KERN_INFO "--- Deactivate storage 1 ---\n");
15687 +       toi_deactivate_storage(1);
15688 +       schedule();
15689 +       printk(KERN_INFO "--- Cleanup usm ---\n");
15690 +       toi_cleanup_usm();
15691 +       schedule();
15692 +       printk(KERN_INFO "--- Storage manager simulate ends ---\n");
15693 +}
15694 +
15695 +static int usm_storage_needed(void)
15696 +{
15697 +       return strlen(usm_helper_data.program);
15698 +}
15699 +
15700 +static int usm_save_config_info(char *buf)
15701 +{
15702 +       int len = strlen(usm_helper_data.program);
15703 +       memcpy(buf, usm_helper_data.program, len);
15704 +       return len;
15705 +}
15706 +
15707 +static void usm_load_config_info(char *buf, int size)
15708 +{
15709 +       /* Don't load the saved path if one has already been set */
15710 +       if (usm_helper_data.program[0])
15711 +               return;
15712 +
15713 +       memcpy(usm_helper_data.program, buf, size);
15714 +}
15715 +
15716 +static int usm_memory_needed(void)
15717 +{
15718 +       /* ballpark figure of 32 pages */
15719 +       return 32 * PAGE_SIZE;
15720 +}
15721 +
15722 +/* toi_prepare_usm
15723 + */
15724 +int toi_prepare_usm(void)
15725 +{
15726 +       usm_prepare_count++;
15727 +
15728 +       if (usm_prepare_count > 1 || !usm_ops.enabled)
15729 +               return 0;
15730 +
15731 +       usm_helper_data.pid = -1;
15732 +
15733 +       if (!*usm_helper_data.program)
15734 +               return 0;
15735 +
15736 +       toi_netlink_setup(&usm_helper_data);
15737 +
15738 +       if (usm_helper_data.pid == -1)
15739 +               printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
15740 +                               " start it.\n");
15741 +
15742 +       toi_activate_storage(0);
15743 +
15744 +       return usm_helper_data.pid != -1;
15745 +}
15746 +
15747 +void toi_cleanup_usm(void)
15748 +{
15749 +       usm_prepare_count--;
15750 +
15751 +       if (usm_helper_data.pid > -1 && !usm_prepare_count) {
15752 +               toi_deactivate_storage(0);
15753 +               toi_netlink_close(&usm_helper_data);
15754 +       }
15755 +}
15756 +
15757 +static void storage_manager_activate(void)
15758 +{
15759 +       if (storage_manager_action == storage_manager_last_action)
15760 +               return;
15761 +
15762 +       if (storage_manager_action)
15763 +               toi_prepare_usm();
15764 +       else
15765 +               toi_cleanup_usm();
15766 +
15767 +       storage_manager_last_action = storage_manager_action;
15768 +}
15769 +
15770 +/*
15771 + * Storage manager specific /sys/power/tuxonice entries.
15772 + */
15773 +
15774 +static struct toi_sysfs_data sysfs_params[] = {
15775 +       { TOI_ATTR("simulate_atomic_copy", SYSFS_RW),
15776 +         .type                         = TOI_SYSFS_DATA_NONE,
15777 +         .write_side_effect            = storage_manager_simulate,
15778 +       },
15779 +
15780 +       { TOI_ATTR("enabled", SYSFS_RW),
15781 +         SYSFS_INT(&usm_ops.enabled, 0, 1, 0)
15782 +       },
15783 +
15784 +       { TOI_ATTR("program", SYSFS_RW),
15785 +         SYSFS_STRING(usm_helper_data.program, 254, 0)
15786 +       },
15787 +
15788 +       { TOI_ATTR("activate_storage", SYSFS_RW),
15789 +         SYSFS_INT(&storage_manager_action, 0, 1, 0),
15790 +         .write_side_effect            = storage_manager_activate,
15791 +       }
15792 +};
15793 +
15794 +static struct toi_module_ops usm_ops = {
15795 +       .type                           = MISC_MODULE,
15796 +       .name                           = "usm",
15797 +       .directory                      = "storage_manager",
15798 +       .module                         = THIS_MODULE,
15799 +       .storage_needed                 = usm_storage_needed,
15800 +       .save_config_info               = usm_save_config_info,
15801 +       .load_config_info               = usm_load_config_info,
15802 +       .memory_needed                  = usm_memory_needed,
15803 +
15804 +       .sysfs_data                     = sysfs_params,
15805 +       .num_sysfs_entries              = sizeof(sysfs_params) /
15806 +               sizeof(struct toi_sysfs_data),
15807 +};
15808 +
15809 +/* toi_usm_init
15810 + * Description: Boot time initialisation for the userspace storage manager.
15811 + */
15812 +int toi_usm_init(void)
15813 +{
15814 +       usm_helper_data.nl = NULL;
15815 +       usm_helper_data.program[0] = '\0';
15816 +       usm_helper_data.pid = -1;
15817 +       usm_helper_data.skb_size = 0;
15818 +       usm_helper_data.pool_limit = 6;
15819 +       usm_helper_data.netlink_id = NETLINK_TOI_USM;
15820 +       usm_helper_data.name = "userspace storage manager";
15821 +       usm_helper_data.rcv_msg = usm_user_rcv_msg;
15822 +       usm_helper_data.interface_version = 1;
15823 +       usm_helper_data.must_init = 0;
15824 +       init_completion(&usm_helper_data.wait_for_process);
15825 +
15826 +       return toi_register_module(&usm_ops);
15827 +}
15828 +
15829 +void toi_usm_exit(void)
15830 +{
15831 +       toi_netlink_close_complete(&usm_helper_data);
15832 +       toi_unregister_module(&usm_ops);
15833 +}
15834 diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
15835 new file mode 100644
15836 index 0000000..2f895bf
15837 --- /dev/null
15838 +++ b/kernel/power/tuxonice_storage.h
15839 @@ -0,0 +1,53 @@
15840 +/*
15841 + * kernel/power/tuxonice_storage.h
15842 + *
15843 + * Copyright (C) 2005-2007 Nigel Cunningham (nigel at tuxonice net)
15844 + *
15845 + * This file is released under the GPLv2.
15846 + */
15847 +
15848 +#ifdef CONFIG_NET
15849 +int toi_prepare_usm(void);
15850 +void toi_cleanup_usm(void);
15851 +
15852 +int toi_activate_storage(int force);
15853 +int toi_deactivate_storage(int force);
15854 +extern int toi_usm_init(void);
15855 +extern void toi_usm_exit(void);
15856 +#else
15857 +static inline int toi_usm_init(void) { return 0; }
15858 +static inline void toi_usm_exit(void) { }
15859 +
15860 +static inline int toi_activate_storage(int force)
15861 +{
15862 +       return 0;
15863 +}
15864 +
15865 +static inline int toi_deactivate_storage(int force)
15866 +{
15867 +       return 0;
15868 +}
15869 +
15870 +static inline int toi_prepare_usm(void) { return 0; }
15871 +static inline void toi_cleanup_usm(void) { }
15872 +#endif
15873 +
15874 +enum {
15875 +       USM_MSG_BASE = 0x10,
15876 +
15877 +       /* Kernel -> Userspace */
15878 +       USM_MSG_CONNECT = 0x30,
15879 +       USM_MSG_DISCONNECT = 0x31,
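+
+       /* Userspace -> kernel */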
15880 +       USM_MSG_SUCCESS = 0x40,
15881 +       USM_MSG_FAILED = 0x41,
15882 +
15883 +       USM_MSG_MAX,
15884 +};
15885 +
15893 diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
15894 new file mode 100644
15895 index 0000000..aaeff27
15896 --- /dev/null
15897 +++ b/kernel/power/tuxonice_swap.c
15898 @@ -0,0 +1,1284 @@
15899 +/*
15900 + * kernel/power/tuxonice_swap.c
15901 + *
15902 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
15903 + *
15904 + * Distributed under GPLv2.
15905 + *
15906 + * This file encapsulates functions for usage of swap space as a
15907 + * backing store.
15908 + */
15909 +
15910 +#include <linux/suspend.h>
15911 +#include <linux/module.h>
15912 +#include <linux/blkdev.h>
15913 +#include <linux/swapops.h>
15914 +#include <linux/swap.h>
15915 +#include <linux/syscalls.h>
15916 +
15917 +#include "tuxonice.h"
15918 +#include "tuxonice_sysfs.h"
15919 +#include "tuxonice_modules.h"
15920 +#include "tuxonice_io.h"
15921 +#include "tuxonice_ui.h"
15922 +#include "tuxonice_extent.h"
15923 +#include "tuxonice_block_io.h"
15924 +#include "tuxonice_alloc.h"
15925 +
15926 +static struct toi_module_ops toi_swapops;
15927 +
15928 +/* --- Struct of pages stored on disk */
15929 +
15930 +struct sig_data {
15931 +       dev_t device;
15932 +       unsigned long sector;
15933 +       int resume_attempted;
15934 +       int orig_sig_type;
15935 +};
15936 +
15937 +union diskpage {
15938 +       union swap_header swh;  /* swh.magic is the only member used */
15939 +       struct sig_data sig_data;
15940 +};
15941 +
15942 +union p_diskpage {
15943 +       union diskpage *pointer;
15944 +       char *ptr;
15945 +       unsigned long address;
15946 +};
15947 +
15948 +enum {
15949 +       IMAGE_SIGNATURE,
15950 +       NO_IMAGE_SIGNATURE,
15951 +       TRIED_RESUME,
15952 +       NO_TRIED_RESUME,
15953 +};
15954 +
15955 +/*
15956 + * Both hold versions of the swap header page. current_signature_page points
15957 + * to the data we read from disk when starting to hibernate or checking
15958 + * whether to resume. no_image_signature_contents is the copy stored in the
15959 + * image header, showing the swap header page as it was when we hibernated.
15960 + */
15961 +static char *current_signature_page;
15962 +static char no_image_signature_contents[sizeof(struct sig_data)];
15963 +
15964 +/* Devices used for swap */
15965 +static struct toi_bdev_info devinfo[MAX_SWAPFILES];
15966 +
15967 +/* Extent chains for swap & blocks */
15968 +struct hibernate_extent_chain swapextents;
15969 +struct hibernate_extent_chain block_chain[MAX_SWAPFILES];
15970 +
15971 +static dev_t header_dev_t;
15972 +static struct block_device *header_block_device;
15973 +static unsigned long headerblock;
15974 +
15975 +/* For a swapfile we automatically swapon before hibernating and swapoff after. */
15976 +static char swapfilename[32] = "";
15977 +static int toi_swapon_status;
15978 +
15979 +/* Header Page Information */
15980 +static long header_pages_reserved;
15981 +
15982 +/* Swap Pages */
15983 +static long swap_pages_allocated;
15984 +
15985 +/* User Specified Parameters. */
15986 +
15987 +static unsigned long resume_firstblock;
15988 +static dev_t resume_swap_dev_t;
15989 +static struct block_device *resume_block_device;
15990 +
15991 +struct sysinfo swapinfo;
15992 +
15993 +/* Block devices open. */
15994 +struct bdev_opened {
15995 +       dev_t device;
15996 +       struct block_device *bdev;
15997 +};
15998 +
15999 +/*
16000 + * Entry MAX_SWAPFILES is the resume block device, which may
16001 + * be a swap device not enabled when we hibernate.
16002 + * Entry MAX_SWAPFILES + 1 is the header block device, which
16003 + * is needed before we find out which slot it occupies.
16004 + *
16005 + * We use a struct separate from devinfo so that we can track
16006 + * the bdevs we open, because if we need to abort resuming
16007 + * prior to the atomic restore, they need to be closed, but
16008 + * closing them after successfully resuming would be wrong.
16009 + */
16010 +static struct bdev_opened *bdevs_opened[MAX_SWAPFILES + 2];
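+
+/*
+ * For example, assuming MAX_SWAPFILES is 32 in this kernel: slots 0-31
+ * track swapon'd devices by swap index, slot 32 holds the resume=
+ * device and slot 33 holds the header device until its real swap index
+ * is known.
+ */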
16011 +
16012 +/**
16013 + * close_bdev: Close a swap bdev.
16014 + *
16015 + * int: The swap entry number to close.
16016 + */
16017 +static void close_bdev(int i)
16018 +{
16019 +       struct bdev_opened *this = bdevs_opened[i];
16020 +
16021 +       if (!this)
16022 +               return;
16023 +
16024 +       blkdev_put(this->bdev);
16025 +       toi_kfree(8, this);
16026 +       bdevs_opened[i] = NULL;
16027 +}
16028 +
16029 +/**
16030 + * close_bdevs: Close all bdevs we opened.
16031 + *
16032 + * Close all bdevs that we opened and reset the related vars.
16033 + */
16034 +static void close_bdevs(void)
16035 +{
16036 +       int i;
16037 +
16038 +       for (i = 0; i < MAX_SWAPFILES + 2; i++)
16039 +               close_bdev(i);
16040 +
16041 +       resume_block_device = header_block_device = NULL;
16042 +}
16043 +
16044 +/**
16045 + * open_bdev: Open a bdev at resume time.
16046 + *
16047 + * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t
16048 + * (the user can have resume= pointing at a swap partition/file that isn't
16049 + * swapon'd when they hibernate), or MAX_SWAPFILES + 1 for the first page of
16050 + * the header. The header will be on a swap partition that was enabled when we
16051 + * hibernated, but we don't know its real index until we read that first page.
16052 + * device: The device major/minor.
16053 + * display_errs: Whether to report errors (zero to do this quietly).
16054 + *
16055 + * We stored a dev_t in the image header. Open the matching device without
16056 + * requiring /dev/<whatever> in most cases and record the details needed
16057 + * to close it later and avoid duplicating work.
16058 + */
16059 +static struct block_device *open_bdev(int index, dev_t device, int display_errs)
16060 +{
16061 +       struct bdev_opened *this;
16062 +       struct block_device *bdev;
16063 +
16064 +       if (bdevs_opened[index]) {
16065 +               if (bdevs_opened[index]->device == device)
16066 +                       return bdevs_opened[index]->bdev;
16067 +
16068 +               close_bdev(index);
16069 +       }
16070 +
16071 +       bdev = toi_open_by_devnum(device, FMODE_READ);
16072 +
16073 +       if (IS_ERR(bdev) || !bdev) {
16074 +               if (display_errs)
16075 +                       toi_early_boot_message(1, TOI_CONTINUE_REQ,
16076 +                               "Failed to get access to block device "
16077 +                               "\"%x\" (error %d).\n Maybe you need "
16078 +                               "to run mknod and/or lvmsetup in an "
16079 +                               "initrd/ramfs?", device, (int) PTR_ERR(bdev));
16080 +               return ERR_PTR(-EINVAL);
16081 +       }
16082 +
16083 +       this = toi_kzalloc(8, sizeof(struct bdev_opened), GFP_KERNEL);
16084 +       if (!this) {
16085 +               printk(KERN_WARNING "TuxOnIce: Failed to allocate memory for "
16086 +                               "opening a bdev.");
16087 +               blkdev_put(bdev);
16088 +               return ERR_PTR(-ENOMEM);
16089 +       }
16090 +
16091 +       bdevs_opened[index] = this;
16092 +       this->device = device;
16093 +       this->bdev = bdev;
16094 +
16095 +       return bdev;
16096 +}
16097 +
16098 +/**
16099 + * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
16100 + *
16101 + * Activate the given swapfile if it wasn't already enabled. Remember whether
16102 + * we really did swapon it for swapoffing later.
16103 + */
16104 +static void enable_swapfile(void)
16105 +{
16106 +       int activateswapresult = -EINVAL;
16107 +
16108 +       if (swapfilename[0]) {
16109 +               /* Attempt to swap on with maximum priority */
16110 +               activateswapresult = sys_swapon(swapfilename, 0xFFFF);
16111 +               if (activateswapresult && activateswapresult != -EBUSY)
16112 +                       printk(KERN_WARNING "TuxOnIce: The swapfile/partition specified by "
16113 +                               "/sys/power/tuxonice/swap/swapfile "
16114 +                               "(%s) could not be turned on (error %d). "
16115 +                               "Attempting to continue.\n",
16116 +                               swapfilename, activateswapresult);
16117 +               if (!activateswapresult)
16118 +                       toi_swapon_status = 1;
16119 +       }
16120 +}
16121 +
16122 +/**
16123 + * disable_swapfile: Swapoff any file swapon'd at the start of the cycle.
16124 + *
16125 + * If we did successfully swapon a file at the start of the cycle, swapoff
16126 + * it now (finishing up).
16127 + */
16128 +static void disable_swapfile(void)
16129 +{
16130 +       if (!toi_swapon_status)
16131 +               return;
16132 +
16133 +       sys_swapoff(swapfilename);
16134 +       toi_swapon_status = 0;
16135 +}
16136 +
16137 +/**
16138 + * try_to_parse_resume_device: Try to parse resume=
16139 + *
16140 + * Any "swap:" has been stripped away and we just have the path to deal with.
16141 + * We attempt to do name_to_dev_t, open and stat the file. Having opened the
16142 + * file, get the struct block_device * to match.
16143 + */
16144 +static int try_to_parse_resume_device(char *commandline, int quiet)
16145 +{
16146 +       struct kstat stat;
16147 +       int error = 0;
16148 +
16149 +       resume_swap_dev_t = name_to_dev_t(commandline);
16150 +
16151 +       if (!resume_swap_dev_t) {
16152 +               struct file *file = filp_open(commandline,
16153 +                               O_RDONLY|O_LARGEFILE, 0);
16154 +
16155 +               if (!IS_ERR(file) && file) {
16156 +                       vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
16157 +                       filp_close(file, NULL);
16158 +               } else
16159 +                       error = vfs_stat(commandline, &stat);
16160 +               if (!error)
16161 +                       resume_swap_dev_t = stat.rdev;
16162 +       }
16163 +
16164 +       if (!resume_swap_dev_t) {
16165 +               if (quiet)
16166 +                       return 1;
16167 +
16168 +               if (test_toi_state(TOI_TRYING_TO_RESUME))
16169 +                       toi_early_boot_message(1, TOI_CONTINUE_REQ,
16170 +                         "Failed to translate \"%s\" into a device id.\n",
16171 +                         commandline);
16172 +               else
16173 +                       printk(KERN_INFO "TuxOnIce: Can't translate \"%s\" into a device "
16174 +                                       "id yet.\n", commandline);
16175 +               return 1;
16176 +       }
16177 +
16178 +       resume_block_device = open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 0);
16179 +       if (IS_ERR(resume_block_device)) {
16180 +               if (!quiet)
16181 +                       toi_early_boot_message(1, TOI_CONTINUE_REQ,
16182 +                               "Failed to get access to \"%s\", where"
16183 +                               " the swap header should be found.",
16184 +                               commandline);
16185 +               return 1;
16186 +       }
16187 +
16188 +       return 0;
16189 +}
16190 +
16191 +/*
16192 + * If we have read part of the image, we might have filled memory with
16193 + * data that should be zeroed out.
16194 + */
16195 +static void toi_swap_noresume_reset(void)
16196 +{
16197 +       toi_bio_ops.rw_cleanup(READ);
16198 +       memset((char *) &devinfo, 0, sizeof(devinfo));
16199 +}
16200 +
16201 +static int get_current_signature(void)
16202 +{
16203 +       int result;
16204 +
16205 +       if (current_signature_page)
16206 +               return 0;
16207 +
16208 +       current_signature_page = (char *) toi_get_zeroed_page(38,
16209 +                       TOI_ATOMIC_GFP);
16210 +       if (!current_signature_page)
16211 +               return -ENOMEM;
16212 +
16213 +       result = toi_bio_ops.bdev_page_io(READ, resume_block_device,
16214 +               resume_firstblock, virt_to_page(current_signature_page));
16215 +
16216 +       return result;
16217 +}
16218 +
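+/*
+ * Return values: an index into sigs[] (0-4) for a normal or another
+ * implementation's signature, 10 for our own TuxOnIce signature, -1 if
+ * nothing recognisable is found, or a negative error code if the
+ * signature page could not be read.
+ */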
16219 +static int parse_signature(void)
16220 +{
16221 +       union p_diskpage swap_header_page;
16222 +       struct sig_data *sig;
16223 +       int type;
16224 +       char *swap_header;
16225 +       const char *sigs[] = {
16226 +               "SWAP-SPACE", "SWAPSPACE2", "S1SUSP", "S2SUSP", "S1SUSPEND"
16227 +       };
16228 +
16229 +       if (!current_signature_page) {
16230 +               int result = get_current_signature();
16231 +
16232 +               if (result)
16233 +                       return result;
16234 +       }
16235 +
16236 +       swap_header_page = (union p_diskpage) current_signature_page;
16237 +       sig = (struct sig_data *) current_signature_page;
16238 +       swap_header = swap_header_page.pointer->swh.magic.magic;
16239 +
16240 +       for (type = 0; type < 5; type++)
16241 +               if (!memcmp(sigs[type], swap_header, strlen(sigs[type])))
16242 +                       return type;
16243 +
16244 +       if (memcmp(tuxonice_signature, swap_header, sizeof(tuxonice_signature)))
16245 +               return -1;
16246 +
16247 +       header_dev_t = sig->device;
16248 +       clear_toi_state(TOI_RESUMED_BEFORE);
16249 +       if (sig->resume_attempted)
16250 +               set_toi_state(TOI_RESUMED_BEFORE);
16251 +       headerblock = sig->sector;
16252 +
16253 +       return 10;
16254 +}
16255 +
16256 +static void forget_signatures(void)
16257 +{
16258 +       if (current_signature_page) {
16259 +               toi_free_page(38, (unsigned long) current_signature_page);
16260 +               current_signature_page = NULL;
16261 +       }
16262 +}
16263 +
16264 +/*
16265 + * write_modified_signature
16266 + *
16267 + * Write a (potentially) modified signature page without forgetting the
16268 + * original contents.
16269 + */
16270 +static int write_modified_signature(int modification)
16271 +{
16272 +       union p_diskpage swap_header_page;
16273 +       struct swap_info_struct *si;
16274 +       int result;
16275 +       char *orig_sig;
16276 +
16277 +       /* In case we haven't already */
16278 +       result = get_current_signature();
16279 +
16280 +       if (result)
16281 +               return result;
16282 +
16283 +       swap_header_page.address = toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
16284 +
16285 +       if (!swap_header_page.address)
16286 +               return -ENOMEM;
16287 +
16288 +       memcpy(swap_header_page.ptr, current_signature_page, PAGE_SIZE);
16289 +
16290 +       switch (modification) {
16291 +       case IMAGE_SIGNATURE:
16292 +
16293 +               memcpy(no_image_signature_contents, swap_header_page.ptr,
16294 +                               sizeof(no_image_signature_contents));
16295 +
16296 +               /* Get the details of the header first page. */
16297 +               toi_extent_state_goto_start(&toi_writer_posn);
16298 +               toi_bio_ops.forward_one_page(1);
16299 +
16300 +               si = get_swap_info_struct(toi_writer_posn.current_chain);
16301 +
16302 +               /* Prepare the signature */
16303 +               swap_header_page.pointer->sig_data.device = si->bdev->bd_dev;
16304 +               swap_header_page.pointer->sig_data.sector =
16305 +                       toi_writer_posn.current_offset;
16306 +               swap_header_page.pointer->sig_data.resume_attempted = 0;
16307 +               swap_header_page.pointer->sig_data.orig_sig_type =
16308 +                       parse_signature();
16309 +
16310 +               memcpy(swap_header_page.pointer->swh.magic.magic,
16311 +                               tuxonice_signature, sizeof(tuxonice_signature));
16312 +
16313 +               break;
16314 +       case NO_IMAGE_SIGNATURE:
16315 +               if (!swap_header_page.pointer->sig_data.orig_sig_type)
16316 +                       orig_sig = "SWAP-SPACE";
16317 +               else
16318 +                       orig_sig = "SWAPSPACE2";
16319 +
16320 +               memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
16321 +               memcpy(swap_header_page.ptr, no_image_signature_contents,
16322 +                               sizeof(no_image_signature_contents));
16323 +               break;
16324 +       case TRIED_RESUME:
16325 +               swap_header_page.pointer->sig_data.resume_attempted = 1;
16326 +               break;
16327 +       case NO_TRIED_RESUME:
16328 +               swap_header_page.pointer->sig_data.resume_attempted = 0;
16329 +               break;
16330 +       }
16331 +
16332 +       result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
16333 +               resume_firstblock, virt_to_page(swap_header_page.address));
16334 +
16335 +       memcpy(current_signature_page, swap_header_page.ptr, PAGE_SIZE);
16336 +
16337 +       toi_free_page(38, swap_header_page.address);
16338 +
16339 +       return result;
16340 +}
16341 +
16342 +/*
16343 + * apply_header_reservation
16344 + *
16345 + * Use 0 (READ) to forward_one_page so it doesn't complain if we haven't
16346 + * allocated storage yet.
16347 + */
16348 +static int apply_header_reservation(void)
16349 +{
16350 +       int i;
16351 +
16352 +       toi_extent_state_goto_start(&toi_writer_posn);
16353 +       toi_bio_ops.forward_one_page(0); /* To first page */
16354 +
16355 +       for (i = 0; i < header_pages_reserved; i++)
16356 +               if (toi_bio_ops.forward_one_page(0))
16357 +                       return -ENOSPC;
16358 +
16359 +       /* The end of header pages will be the start of pageset 2;
16360 +        * we are now sitting on the first pageset2 page. */
16361 +       toi_extent_state_save(&toi_writer_posn, &toi_writer_posn_save[2]);
16362 +       return 0;
16363 +}
16364 +
16365 +static void toi_swap_reserve_header_space(int request)
16366 +{
16367 +       header_pages_reserved = (long) request;
16368 +
16369 +       /* This only succeeds if storage is already allocated, so ignore failure */
16370 +       apply_header_reservation();
16371 +}
16372 +
16373 +static void free_block_chains(void)
16374 +{
16375 +       int i;
16376 +
16377 +       for (i = 0; i < MAX_SWAPFILES; i++)
16378 +               if (block_chain[i].first)
16379 +                       toi_put_extent_chain(&block_chain[i]);
16380 +}
16381 +
16382 +static int add_blocks_to_extent_chain(int chain, int start, int end)
16383 +{
16384 +       if (test_action_state(TOI_TEST_BIO))
16385 +               printk(KERN_INFO "Adding extent chain %d %d-%d.\n", chain,
16386 +                               start << devinfo[chain].bmap_shift,
16387 +                               end << devinfo[chain].bmap_shift);
16388 +
16389 +       if (toi_add_to_extent_chain(&block_chain[chain], start, end)) {
16390 +               free_block_chains();
16391 +               return -ENOMEM;
16392 +       }
16393 +
16394 +       return 0;
16395 +}
16396 +
16397 +
16398 +static int get_main_pool_phys_params(void)
16399 +{
16400 +       struct hibernate_extent *extentpointer = NULL;
16401 +       unsigned long address;
16402 +       int extent_min = -1, extent_max = -1, last_chain = -1;
16403 +
16404 +       free_block_chains();
16405 +
16406 +       toi_extent_for_each(&swapextents, extentpointer, address) {
16407 +               swp_entry_t swap_address = (swp_entry_t) { address };
16408 +               pgoff_t offset = swp_offset(swap_address);
16409 +               unsigned swapfilenum = swp_type(swap_address);
16410 +               struct swap_info_struct *sis =
16411 +                       get_swap_info_struct(swapfilenum);
16412 +               sector_t new_sector = map_swap_page(sis, offset);
16413 +
16414 +               if ((new_sector == extent_max + 1) &&
16415 +                   (last_chain == swapfilenum)) {
16416 +                       extent_max++;
16417 +                       continue;
16418 +               }
16419 +
16420 +               if (extent_min > -1 && add_blocks_to_extent_chain(last_chain,
16421 +                                       extent_min, extent_max))
16422 +                       return -ENOMEM;
16423 +
16424 +               extent_min = extent_max = new_sector;
16425 +               last_chain = swapfilenum;
16426 +       }
16427 +
16428 +       if (extent_min > -1 && add_blocks_to_extent_chain(last_chain,
16429 +                               extent_min, extent_max))
16430 +               return -ENOMEM;
16431 +
16432 +       return apply_header_reservation();
16433 +}
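+/*
+ * Illustrative example: swap entries whose sectors map to 8, 9, 10 and
+ * 40 on one device are recorded as the extents 8-10 and 40-40, since
+ * consecutive sectors on the same chain are merged above.
+ */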
16434 +
16435 +static long raw_to_real(long raw)
16436 +{
16437 +       long result;
16438 +
16439 +       result = raw - (raw * (sizeof(unsigned long) + sizeof(int)) +
16440 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
16441 +               (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
16442 +
16443 +       return result < 0 ? 0 : result;
16444 +}
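+/*
+ * Illustrative arithmetic (assuming 4096-byte pages and a 64-bit build,
+ * where each stored page costs an unsigned long + int = 12 bytes of
+ * index metadata): raw_to_real(1000) = 1000 - (12000 + 4109) / 4108
+ * = 1000 - 3 = 997 pages actually usable for data.
+ */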
16445 +
16446 +static int toi_swap_storage_allocated(void)
16447 +{
16448 +       return (int) raw_to_real(swap_pages_allocated - header_pages_reserved);
16449 +}
16450 +
16451 +/*
16452 + * We can't just remember the value from allocation time, because other
16453 + * processes might have allocated swap in the mean time.
16454 + */
16455 +static int toi_swap_storage_available(void)
16456 +{
16457 +       si_swapinfo(&swapinfo);
16458 +       return (int) raw_to_real((long) swapinfo.freeswap +
16459 +                       swap_pages_allocated - header_pages_reserved);
16460 +}
16461 +
16462 +static int toi_swap_initialise(int starting_cycle)
16463 +{
16464 +       if (!starting_cycle)
16465 +               return 0;
16466 +
16467 +       enable_swapfile();
16468 +
16469 +       if (resume_swap_dev_t && !resume_block_device &&
16470 +           IS_ERR(resume_block_device =
16471 +                       open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 1)))
16472 +               return 1;
16473 +
16474 +       return 0;
16475 +}
16476 +
16477 +static void toi_swap_cleanup(int ending_cycle)
16478 +{
16479 +       if (ending_cycle)
16480 +               disable_swapfile();
16481 +
16482 +       close_bdevs();
16483 +
16484 +       forget_signatures();
16485 +}
16486 +
16487 +static int toi_swap_release_storage(void)
16488 +{
16489 +       if (test_action_state(TOI_KEEP_IMAGE) &&
16490 +           test_toi_state(TOI_NOW_RESUMING))
16491 +               return 0;
16492 +
16493 +       header_pages_reserved = 0;
16494 +       swap_pages_allocated = 0;
16495 +
16496 +       if (swapextents.first) {
16497 +               /* Free swap entries */
16498 +               struct hibernate_extent *extentpointer;
16499 +               unsigned long extentvalue;
16500 +               toi_extent_for_each(&swapextents, extentpointer,
16501 +                               extentvalue)
16502 +                       swap_free((swp_entry_t) { extentvalue });
16503 +
16504 +               toi_put_extent_chain(&swapextents);
16505 +
16506 +               free_block_chains();
16507 +       }
16508 +
16509 +       return 0;
16510 +}
16511 +
16512 +static void free_swap_range(unsigned long min, unsigned long max)
16513 +{
16514 +       int j;
16515 +
16516 +       for (j = min; j <= max; j++)
16517 +               swap_free((swp_entry_t) { j });
16518 +}
16519 +
16520 +/*
16521 + * Round-robin allocation (used where swap storage has the same priority)
16522 + * could make this very inefficient, so we track extents allocated on
16523 + * a per-swapfile basis.
16524 + *
16525 + * We ignore here the fact that some space is for the header and doesn't
16526 + * have the overhead. It will only rarely make a 1 page difference.
16527 + */
16528 +static int toi_swap_allocate_storage(int request)
16529 +{
16530 +       int i, result = 0, to_add[MAX_SWAPFILES], pages_to_get, extra_pages,
16531 +           gotten = 0;
16532 +       unsigned long extent_min[MAX_SWAPFILES], extent_max[MAX_SWAPFILES];
16533 +
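+       /*
+        * Each page we store needs an index entry (an unsigned long plus
+        * an int) in the header, so allocate enough extra pages to hold
+        * that metadata too.
+        */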
16534 +       extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
16535 +                              + sizeof(int)), PAGE_SIZE);
16536 +       pages_to_get = request + extra_pages - swapextents.size;
16537 +
16538 +       if (pages_to_get < 1)
16539 +               return 0;
16540 +
16541 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16542 +               struct swap_info_struct *si = get_swap_info_struct(i);
16543 +               to_add[i] = 0;
16544 +               if (!si->bdev)
16545 +                       continue;
16546 +               devinfo[i].bdev = si->bdev;
16547 +               devinfo[i].dev_t = si->bdev->bd_dev;
16548 +               devinfo[i].bmap_shift = 3;
16549 +               devinfo[i].blocks_per_page = 1;
16550 +       }
16551 +
16552 +       for (i = 0; i < pages_to_get; i++) {
16553 +               swp_entry_t entry;
16554 +               unsigned long new_value;
16555 +               unsigned swapfilenum;
16556 +
16557 +               entry = get_swap_page();
16558 +               if (!entry.val)
16559 +                       break;
16560 +
16561 +               swapfilenum = swp_type(entry);
16562 +               new_value = entry.val;
16563 +
16564 +               if (!to_add[swapfilenum]) {
16565 +                       to_add[swapfilenum] = 1;
16566 +                       extent_min[swapfilenum] = new_value;
16567 +                       extent_max[swapfilenum] = new_value;
16568 +                       gotten++;
16569 +                       continue;
16570 +               }
16571 +
16572 +               if (new_value == extent_max[swapfilenum] + 1) {
16573 +                       extent_max[swapfilenum]++;
16574 +                       gotten++;
16575 +                       continue;
16576 +               }
16577 +
16578 +               if (toi_add_to_extent_chain(&swapextents,
16579 +                                       extent_min[swapfilenum],
16580 +                                       extent_max[swapfilenum])) {
16581 +                       printk(KERN_INFO "Failed to allocate extent for "
16582 +                                       "%lu-%lu.\n", extent_min[swapfilenum],
16583 +                                       extent_max[swapfilenum]);
16584 +                       free_swap_range(extent_min[swapfilenum],
16585 +                                       extent_max[swapfilenum]);
16586 +                       swap_free(entry);
16587 +                       gotten -= (extent_max[swapfilenum] -
16588 +                                       extent_min[swapfilenum] + 1);
16589 +                       /* Don't try to add again below */
16590 +                       to_add[swapfilenum] = 0;
16591 +                       break;
16592 +               } else {
16593 +                       extent_min[swapfilenum] = new_value;
16594 +                       extent_max[swapfilenum] = new_value;
16595 +                       gotten++;
16596 +               }
16597 +       }
16598 +
16599 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16600 +               if (!to_add[i] || !toi_add_to_extent_chain(&swapextents,
16601 +                                       extent_min[i], extent_max[i]))
16602 +                       continue;
16603 +
16604 +               free_swap_range(extent_min[i], extent_max[i]);
16605 +               gotten -= (extent_max[i] - extent_min[i] + 1);
16606 +               break;
16607 +       }
16608 +
16609 +       if (gotten < pages_to_get)
16610 +               result = -ENOSPC;
16611 +
16612 +       swap_pages_allocated += (long) gotten;
16613 +
16614 +       return result ? result : get_main_pool_phys_params();
16615 +}
16616 +
16617 +static int toi_swap_write_header_init(void)
16618 +{
16619 +       int i, result;
16620 +       struct swap_info_struct *si;
16621 +
16622 +       toi_bio_ops.rw_init(WRITE, 0);
16623 +       toi_writer_buffer_posn = 0;
16624 +
16625 +       /* Info needed to bootstrap goes at the start of the header.
16626 +        * First we save the positions and devinfo, including the number
16627 +        * of header pages. Then we save the structs containing data needed
16628 +        * for reading the header pages back.
16629 +        * Note that even if the header takes more than one page, when we
16630 +        * read back the info, we will have restored the location of the
16631 +        * next header page by the time we go to use it.
16632 +        */
16633 +
16634 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
16635 +                       (char *) &no_image_signature_contents,
16636 +                       sizeof(struct sig_data));
16637 +
16638 +       if (result)
16639 +               return result;
16640 +
16641 +       /* Forward one page will be done prior to the read */
16642 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16643 +               si = get_swap_info_struct(i);
16644 +               if (si->swap_file)
16645 +                       devinfo[i].dev_t = si->bdev->bd_dev;
16646 +               else
16647 +                       devinfo[i].dev_t = (dev_t) 0;
16648 +       }
16649 +
16650 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
16651 +                       (char *) &toi_writer_posn_save,
16652 +                       sizeof(toi_writer_posn_save));
16653 +
16654 +       if (result)
16655 +               return result;
16656 +
16657 +       result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
16658 +                       (char *) &devinfo, sizeof(devinfo));
16659 +
16660 +       if (result)
16661 +               return result;
16662 +
16663 +       for (i = 0; i < MAX_SWAPFILES; i++)
16664 +               toi_serialise_extent_chain(&toi_swapops, &block_chain[i]);
16665 +
16666 +       return 0;
16667 +}
16668 +
16669 +static int toi_swap_write_header_cleanup(void)
16670 +{
16671 +       /* Write any unsaved data */
16672 +       if (toi_writer_buffer_posn)
16673 +               toi_bio_ops.write_header_chunk_finish();
16674 +
16675 +       toi_bio_ops.finish_all_io();
16676 +
16677 +       /* Set the signature to say we have an image */
16678 +       return write_modified_signature(IMAGE_SIGNATURE);
16679 +}
16680 +
16681 +/* ------------------------- HEADER READING ------------------------- */
16682 +
16683 +/*
16684 + * read_header_init()
16685 + *
16686 + * Description:
16687 + * 1. Attempt to read the device specified with resume=.
16688 + * 2. Check the contents of the swap header for our signature.
16689 + * 3. Warn, ignore, reset and/or continue as appropriate.
16690 + * 4. If continuing, read the toi_swap configuration section
16691 + *    of the header and set up block device info so we can read
16692 + *    the rest of the header & image.
16693 + *
16694 + * Returns:
16695 + * May not return if the user chooses to reboot at a warning.
16696 + * -EINVAL if cannot resume at this time. Booting should continue
16697 + * normally.
16698 + */
16699 +
16700 +static int toi_swap_read_header_init(void)
16701 +{
16702 +       int i, result = 0;
16703 +       toi_writer_buffer_posn = 0;
16704 +
16705 +       if (!header_dev_t) {
16706 +               printk(KERN_INFO "read_header_init called when we haven't "
16707 +                               "verified there is an image!\n");
16708 +               return -EINVAL;
16709 +       }
16710 +
16711 +       /*
16712 +        * If the header is not on the resume_swap_dev_t, get the resume device
16713 +        * first.
16714 +        */
16715 +       if (header_dev_t != resume_swap_dev_t) {
16716 +               header_block_device = open_bdev(MAX_SWAPFILES + 1,
16717 +                               header_dev_t, 1);
16718 +
16719 +               if (IS_ERR(header_block_device))
16720 +                       return PTR_ERR(header_block_device);
16721 +       } else
16722 +               header_block_device = resume_block_device;
16723 +
16724 +       toi_bio_ops.read_header_init();
16725 +
16726 +       /*
16727 +        * Read toi_swap configuration.
16728 +        * Headerblock size taken into account already.
16729 +        */
16730 +       result = toi_bio_ops.bdev_page_io(READ, header_block_device,
16731 +                       headerblock << 3,
16732 +                       virt_to_page((unsigned long) toi_writer_buffer));
16733 +       if (result)
16734 +               return result;
16735 +
16736 +       memcpy(&no_image_signature_contents, toi_writer_buffer,
16737 +                       sizeof(no_image_signature_contents));
16738 +
16739 +       toi_writer_buffer_posn = sizeof(no_image_signature_contents);
16740 +
16741 +       memcpy(&toi_writer_posn_save, toi_writer_buffer +
16742 +                       toi_writer_buffer_posn, sizeof(toi_writer_posn_save));
16743 +
16744 +       toi_writer_buffer_posn += sizeof(toi_writer_posn_save);
16745 +
16746 +       memcpy(&devinfo, toi_writer_buffer + toi_writer_buffer_posn,
16747 +                       sizeof(devinfo));
16748 +
16749 +       toi_writer_buffer_posn += sizeof(devinfo);
16750 +
16751 +       /* Restore device info */
16752 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16753 +               dev_t thisdevice = devinfo[i].dev_t;
16754 +               struct block_device *result;
16755 +
16756 +               devinfo[i].bdev = NULL;
16757 +
16758 +               if (!thisdevice)
16759 +                       continue;
16760 +
16761 +               if (thisdevice == resume_swap_dev_t) {
16762 +                       devinfo[i].bdev = resume_block_device;
16763 +                       continue;
16764 +               }
16765 +
16766 +               if (thisdevice == header_dev_t) {
16767 +                       devinfo[i].bdev = header_block_device;
16768 +                       continue;
16769 +               }
16770 +
16771 +               result = open_bdev(i, thisdevice, 1);
16772 +               if (IS_ERR(result))
16773 +                       return PTR_ERR(result);
16774 +               devinfo[i].bdev = bdevs_opened[i]->bdev;
16775 +       }
16776 +
16777 +       toi_extent_state_goto_start(&toi_writer_posn);
16778 +       toi_bio_ops.set_extra_page_forward();
16779 +
16780 +       for (i = 0; i < MAX_SWAPFILES && !result; i++)
16781 +               result = toi_load_extent_chain(&block_chain[i]);
16782 +
16783 +       return result;
16784 +}
16785 +
16786 +static int toi_swap_read_header_cleanup(void)
16787 +{
16788 +       toi_bio_ops.rw_cleanup(READ);
16789 +       return 0;
16790 +}
16791 +
16792 +/*
16793 + * workspace_size
16794 + *
16795 + * Description:
16796 + * Returns the number of bytes of RAM needed for this
16797 + * code to do its work. (Used when calculating whether
16798 + * we have enough memory to be able to hibernate & resume).
16799 + *
16800 + */
16801 +static int toi_swap_memory_needed(void)
16802 +{
16803 +       return 1;
16804 +}
16805 +
16806 +/*
16807 + * Print debug info
16808 + *
16809 + * Description: Report the allocator's state and available swap.
16810 + */
16811 +static int toi_swap_print_debug_stats(char *buffer, int size)
16812 +{
16813 +       int len = 0;
16814 +       struct sysinfo sysinfo;
16815 +
16816 +       if (toiActiveAllocator != &toi_swapops) {
16817 +               len = snprintf_used(buffer, size,
16818 +                               "- SwapAllocator inactive.\n");
16819 +               return len;
16820 +       }
16821 +
16822 +       len = snprintf_used(buffer, size, "- SwapAllocator active.\n");
16823 +       if (swapfilename[0])
16824 +               len += snprintf_used(buffer+len, size-len,
16825 +                       "  Attempting to automatically swapon: %s.\n",
16826 +                       swapfilename);
16827 +
16828 +       si_swapinfo(&sysinfo);
16829 +
16830 +       len += snprintf_used(buffer+len, size-len,
16831 +                       "  Swap available for image: %d pages.\n",
16832 +                       (int) sysinfo.freeswap + toi_swap_storage_allocated());
16833 +
16834 +       return len;
16835 +}
16836 +
16837 +/*
16838 + * Storage needed
16839 + *
16840 + * Returns the amount of space in the swap header required
16841 + * for toi_swap's data. This ignores the links between
16842 + * pages, which we factor in when allocating the space.
16843 + *
16844 + * We ensure the space is allocated, but actually save the
16845 + * data from write_header_init and therefore don't also define a
16846 + * save_config_info routine.
16847 + */
16848 +static int toi_swap_storage_needed(void)
16849 +{
16850 +       int i, result;
16851 +       result = sizeof(toi_writer_posn_save) + sizeof(devinfo);
16852 +
16853 +       for (i = 0; i < MAX_SWAPFILES; i++) {
16854 +               result += 3 * sizeof(int);
16855 +               result += (2 * sizeof(unsigned long) *
16856 +                       block_chain[i].num_extents);
16857 +       }
16858 +
16859 +       return result;
16860 +}
16861 +
16862 +/*
16863 + * Image_exists
16864 + *
16865 + * Returns -1 if we don't know, otherwise 0 (no) or 1 (yes).
16866 + */
16867 +static int toi_swap_image_exists(int quiet)
16868 +{
16869 +       int signature_found;
16870 +
16871 +       if (!resume_swap_dev_t) {
16872 +               if (!quiet)
16873 +                       printk(KERN_INFO "Not even trying to read header "
16874 +                               "because resume_swap_dev_t is not set.\n");
16875 +               return -1;
16876 +       }
16877 +
16878 +       if (!resume_block_device &&
16879 +           IS_ERR(resume_block_device =
16880 +                       open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 1))) {
16881 +               if (!quiet)
16882 +                       printk(KERN_INFO "Failed to open resume dev_t (%x).\n",
16883 +                               resume_swap_dev_t);
16884 +               return -1;
16885 +       }
16886 +
16887 +       signature_found = parse_signature();
16888 +
16889 +       switch (signature_found) {
16890 +       case -ENOMEM:
16891 +               return -1;
16892 +       case -1:
16893 +               if (!quiet)
16894 +                       printk(KERN_ERR "TuxOnIce: Unable to find a signature."
16895 +                               " Could you have moved a swap file?\n");
16896 +               return -1;
16897 +       case 0:
16898 +       case 1:
16899 +               if (!quiet)
16900 +                       printk(KERN_INFO "TuxOnIce: Normal swapspace found.\n");
16901 +               return 0;
16902 +       case 2:
16903 +       case 3:
16904 +       case 4:
16905 +               if (!quiet)
16906 +                       printk(KERN_INFO "TuxOnIce: Detected another "
16907 +                               "implementation's signature.\n");
16908 +               return 0;
16909 +       case 10:
16910 +               if (!quiet)
16911 +                       printk(KERN_INFO "TuxOnIce: Detected TuxOnIce binary "
16912 +                               "signature.\n");
16913 +               return 1;
16914 +       }
16915 +
16916 +       BUG();
16917 +       return 0;
16918 +}
16919 +
16920 +/* toi_swap_remove_image
16921 + * Restore the original swap signature so no image is found on resume.
16922 + */
16923 +static int toi_swap_remove_image(void)
16924 +{
16925 +       /*
16926 +        * If nr_hibernates == 0, we must be booting, so no swap pages
16927 +        * will be recorded as used yet.
16928 +        */
16929 +
16930 +       if (nr_hibernates)
16931 +               toi_swap_release_storage();
16932 +
16933 +       /*
16934 +        * We don't do a sanity check here: we want to restore the swap
16935 +        * whatever version of kernel made the hibernate image.
16936 +        *
16937 +        * We need to write swap, but swap may not be enabled, so
16938 +        * we write the device directly.
16939 +        *
16940 +        * If we don't have a current_signature_page, we didn't
16941 +        * read an image header, so don't change anything.
16942 +        */
16943 +
16944 +       return toi_swap_image_exists(1) ?
16945 +               write_modified_signature(NO_IMAGE_SIGNATURE) : 0;
16946 +}
16947 +
16948 +/*
16949 + * Mark resume attempted.
16950 + *
16951 + * Record that we tried to resume from this image. We have already read the
16952 + * signature in. We just need to write the modified version.
16953 + */
16954 +static int toi_swap_mark_resume_attempted(int mark)
16955 +{
16956 +       if (!resume_swap_dev_t) {
16957 +               printk(KERN_INFO "Not even trying to record attempt at resuming"
16958 +                               " because resume_swap_dev_t is not set.\n");
16959 +               return -ENODEV;
16960 +       }
16961 +
16962 +       return write_modified_signature(mark ? TRIED_RESUME : NO_TRIED_RESUME);
16963 +}
16964 +
16965 +/*
16966 + * Parse Image Location
16967 + *
16968 + * Attempt to parse a resume= parameter.
16969 + * Swap Writer accepts:
16970 + * resume=swap:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
16971 + *
16972 + * Where:
16973 + * DEVNAME is convertible to a dev_t by name_to_dev_t
16974 + * FIRSTBLOCK is the location of the first block in the swap file
16975 + * (specifying it for a swap partition is nonsensical but not prohibited).
16976 + * Data is validated by attempting to read a swap header from the
16977 + * location given. Failure will result in toi_swap refusing to
16978 + * save an image, and a reboot with correct parameters will be
16979 + * necessary.
16980 + */
16981 +static int toi_swap_parse_sig_location(char *commandline,
16982 +               int only_allocator, int quiet)
16983 +{
16984 +       char *thischar, *devstart, *colon = NULL;
16985 +       int signature_found, result = -EINVAL, temp_result;
16986 +
16987 +       if (strncmp(commandline, "swap:", 5)) {
16988 +               /*
16989 +                * Failing swap:, we'll take a simple
16990 +                * resume=/dev/hda2, but fall through to
16991 +                * other allocators if /dev/ isn't matched.
16992 +                */
16993 +               if (strncmp(commandline, "/dev/", 5))
16994 +                       return 1;
16995 +       } else
16996 +               commandline += 5;
16997 +
16998 +       devstart = thischar = commandline;
16999 +       while ((*thischar != ':') && (*thischar != '@') &&
17000 +               ((thischar - commandline) < 250) && (*thischar))
17001 +               thischar++;
17002 +
17003 +       if (*thischar == ':') {
17004 +               colon = thischar;
17005 +               *colon = 0;
17006 +               thischar++;
17007 +       }
17008 +
17009 +       while ((thischar - commandline) < 250 && *thischar)
17010 +               thischar++;
17011 +
17012 +       if (colon)
17013 +               resume_firstblock = (int) simple_strtoul(colon + 1, NULL, 0);
17014 +       else
17015 +               resume_firstblock = 0;
17016 +
17017 +       clear_toi_state(TOI_CAN_HIBERNATE);
17018 +       clear_toi_state(TOI_CAN_RESUME);
17019 +
17020 +       temp_result = try_to_parse_resume_device(devstart, quiet);
17021 +
17022 +       if (colon)
17023 +               *colon = ':';
17024 +
17025 +       if (temp_result)
17026 +               return -EINVAL;
17027 +
17028 +       signature_found = toi_swap_image_exists(quiet);
17029 +
17030 +       if (signature_found != -1) {
17031 +               result = 0;
17032 +
17033 +               toi_bio_ops.set_devinfo(devinfo);
17034 +               toi_writer_posn.chains = &block_chain[0];
17035 +               toi_writer_posn.num_chains = MAX_SWAPFILES;
17036 +               set_toi_state(TOI_CAN_HIBERNATE);
17037 +               set_toi_state(TOI_CAN_RESUME);
17038 +       } else
17039 +               if (!quiet)
17040 +                       printk(KERN_ERR "TuxOnIce: SwapAllocator: No swap "
17041 +                               "signature found at %s.\n", devstart);
17042 +       return result;
17043 +}
17044 +
17045 +static int header_locations_read_sysfs(const char *page, int count)
17046 +{
17047 +       int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
17048 +       struct inode *swapf = NULL;
17049 +       int zone;
17050 +       char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
17051 +       char *path, *output = (char *) page;
17052 +       int path_len;
17053 +
17054 +       if (!page)
17055 +               return 0;
17056 +
17057 +       for (i = 0; i < MAX_SWAPFILES; i++) {
17058 +               struct swap_info_struct *si =  get_swap_info_struct(i);
17059 +
17060 +               if (!si->swap_file)
17061 +                       continue;
17062 +
17063 +               if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
17064 +                       haveswap = 1;
17065 +                       if (!printedpartitionsmessage) {
17066 +                               len += sprintf(output + len,
17067 +                                       "For swap partitions, simply use the "
17068 +                                       "format: resume=swap:/dev/hda1.\n");
17069 +                               printedpartitionsmessage = 1;
17070 +                       }
17071 +               } else {
17072 +                       path_len = 0;
17073 +
17074 +                       path = d_path(&si->swap_file->f_path, path_page,
17075 +                                       PAGE_SIZE);
17076 +                       path_len = snprintf(path_page, 31, "%s", path);
17077 +
17078 +                       haveswap = 1;
17079 +                       swapf = si->swap_file->f_mapping->host;
17080 +                       zone = bmap(swapf, 0);
17081 +                       if (!zone) {
17082 +                               len += sprintf(output + len,
17083 +                                       "Swapfile %s has been corrupted. Run"
17084 +                                       " mkswap on it again and retry.\n",
17085 +                                       path_page);
17086 +                       } else {
17087 +                               char name_buffer[255];
17088 +                               len += sprintf(output + len,
17089 +                                       "For swapfile `%s`,"
17090 +                                       " use resume=swap:/dev/%s:0x%x.\n",
17091 +                                       path_page,
17092 +                                       bdevname(si->bdev, name_buffer),
17093 +                                       zone << (swapf->i_blkbits - 9));
17094 +                       }
17095 +               }
17096 +       }
17097 +
17098 +       if (!haveswap)
17099 +               len = sprintf(output, "You need to turn on swap partitions "
17100 +                               "before examining this file.\n");
17101 +
17102 +       toi_free_page(10, (unsigned long) path_page);
17103 +       return len;
17104 +}
17105 +
17106 +static struct toi_sysfs_data sysfs_params[] = {
17107 +       {
17108 +        TOI_ATTR("swapfilename", SYSFS_RW),
17109 +        SYSFS_STRING(swapfilename, 255, 0)
17110 +       },
17111 +
17112 +       {
17113 +        TOI_ATTR("headerlocations", SYSFS_READONLY),
17114 +        SYSFS_CUSTOM(header_locations_read_sysfs, NULL, 0)
17115 +       },
17116 +
17117 +       { TOI_ATTR("enabled", SYSFS_RW),
17118 +         SYSFS_INT(&toi_swapops.enabled, 0, 1, 0),
17119 +         .write_side_effect            = attempt_to_parse_resume_device2,
17120 +       }
17121 +};
17122 +
17123 +static struct toi_module_ops toi_swapops = {
17124 +       .type                                   = WRITER_MODULE,
17125 +       .name                                   = "swap storage",
17126 +       .directory                              = "swap",
17127 +       .module                                 = THIS_MODULE,
17128 +       .memory_needed                          = toi_swap_memory_needed,
17129 +       .print_debug_info                       = toi_swap_print_debug_stats,
17130 +       .storage_needed                         = toi_swap_storage_needed,
17131 +       .initialise                             = toi_swap_initialise,
17132 +       .cleanup                                = toi_swap_cleanup,
17133 +
17134 +       .noresume_reset         = toi_swap_noresume_reset,
17135 +       .storage_available      = toi_swap_storage_available,
17136 +       .storage_allocated      = toi_swap_storage_allocated,
17137 +       .release_storage        = toi_swap_release_storage,
17138 +       .reserve_header_space   = toi_swap_reserve_header_space,
17139 +       .allocate_storage       = toi_swap_allocate_storage,
17140 +       .image_exists           = toi_swap_image_exists,
17141 +       .mark_resume_attempted  = toi_swap_mark_resume_attempted,
17142 +       .write_header_init      = toi_swap_write_header_init,
17143 +       .write_header_cleanup   = toi_swap_write_header_cleanup,
17144 +       .read_header_init       = toi_swap_read_header_init,
17145 +       .read_header_cleanup    = toi_swap_read_header_cleanup,
17146 +       .remove_image           = toi_swap_remove_image,
17147 +       .parse_sig_location     = toi_swap_parse_sig_location,
17148 +
17149 +       .sysfs_data             = sysfs_params,
17150 +       .num_sysfs_entries      = sizeof(sysfs_params) /
17151 +               sizeof(struct toi_sysfs_data),
17152 +};
17153 +
17154 +/* ---- Registration ---- */
17155 +static __init int toi_swap_load(void)
17156 +{
17157 +       toi_swapops.rw_init = toi_bio_ops.rw_init;
17158 +       toi_swapops.rw_cleanup = toi_bio_ops.rw_cleanup;
17159 +       toi_swapops.read_page = toi_bio_ops.read_page;
17160 +       toi_swapops.write_page = toi_bio_ops.write_page;
17161 +       toi_swapops.rw_header_chunk = toi_bio_ops.rw_header_chunk;
17162 +       toi_swapops.rw_header_chunk_noreadahead =
17163 +               toi_bio_ops.rw_header_chunk_noreadahead;
17164 +       toi_swapops.io_flusher = toi_bio_ops.io_flusher;
17165 +
17166 +       return toi_register_module(&toi_swapops);
17167 +}
17168 +
17169 +#ifdef MODULE
17170 +static __exit void toi_swap_unload(void)
17171 +{
17172 +       toi_unregister_module(&toi_swapops);
17173 +}
17174 +
17175 +module_init(toi_swap_load);
17176 +module_exit(toi_swap_unload);
17177 +MODULE_LICENSE("GPL");
17178 +MODULE_AUTHOR("Nigel Cunningham");
17179 +MODULE_DESCRIPTION("TuxOnIce SwapAllocator");
17180 +#else
17181 +late_initcall(toi_swap_load);
17182 +#endif
17183 diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
17184 new file mode 100644
17185 index 0000000..01e98d0
17186 --- /dev/null
17187 +++ b/kernel/power/tuxonice_sysfs.c
17188 @@ -0,0 +1,335 @@
17189 +/*
17190 + * kernel/power/tuxonice_sysfs.c
17191 + *
17192 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
17193 + *
17194 + * This file is released under the GPLv2.
17195 + *
17196 + * This file contains support for sysfs entries for tuning TuxOnIce.
17197 + *
17198 + * We have a generic handler that deals with the most common cases, and
17199 + * hooks for special handlers to use.
17200 + */
17201 +
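+/*
+ * Illustrative use of the generic handler (hypothetical my_knob): a
+ * module declares an entry such as
+ *
+ *     static struct toi_sysfs_data entry = {
+ *             TOI_ATTR("my_knob", SYSFS_RW),
+ *             SYSFS_INT(&my_knob, 0, 100, 0)
+ *     };
+ *
+ * and registers it with toi_register_sysfs_file(); toi_attr_show() and
+ * toi_attr_store() below then service reads and writes, clamping stored
+ * values to the given 0-100 range.
+ */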
17202 +#include <linux/suspend.h>
17203 +#include <linux/module.h>
17204 +
17205 +#include "tuxonice_sysfs.h"
17206 +#include "tuxonice.h"
17207 +#include "tuxonice_storage.h"
17208 +#include "tuxonice_alloc.h"
17209 +
17210 +static int toi_sysfs_initialised;
17211 +
17212 +static void toi_initialise_sysfs(void);
17213 +
17214 +static struct toi_sysfs_data sysfs_params[];
17215 +
17216 +#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
17217 +
17218 +static void toi_main_wrapper(void)
17219 +{
17220 +       _toi_try_hibernate(0);
17221 +}
17222 +
17223 +static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
17224 +                             char *page)
17225 +{
17226 +       struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
17227 +       int len = 0;
17228 +
17229 +       if (toi_start_anything(0))
17230 +               return -EBUSY;
17231 +
17232 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
17233 +               toi_prepare_usm();
17234 +
17235 +       switch (sysfs_data->type) {
17236 +       case TOI_SYSFS_DATA_CUSTOM:
17237 +               len = (sysfs_data->data.special.read_sysfs) ?
17238 +                       (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
17239 +                       : 0;
17240 +               break;
17241 +       case TOI_SYSFS_DATA_BIT:
17242 +               len = sprintf(page, "%d\n",
17243 +                       -test_bit(sysfs_data->data.bit.bit,
17244 +                               sysfs_data->data.bit.bit_vector));
17245 +               break;
17246 +       case TOI_SYSFS_DATA_INTEGER:
17247 +               len = sprintf(page, "%d\n",
17248 +                       *(sysfs_data->data.integer.variable));
17249 +               break;
17250 +       case TOI_SYSFS_DATA_LONG:
17251 +               len = sprintf(page, "%ld\n",
17252 +                       *(sysfs_data->data.a_long.variable));
17253 +               break;
17254 +       case TOI_SYSFS_DATA_UL:
17255 +               len = sprintf(page, "%lu\n",
17256 +                       *(sysfs_data->data.ul.variable));
17257 +               break;
17258 +       case TOI_SYSFS_DATA_STRING:
17259 +               len = sprintf(page, "%s\n",
17260 +                       sysfs_data->data.string.variable);
17261 +               break;
17262 +       }
17263 +       /* Side effect routine? */
17264 +       if (sysfs_data->read_side_effect)
17265 +               sysfs_data->read_side_effect();
17266 +
17267 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
17268 +               toi_cleanup_usm();
17269 +
17270 +       toi_finish_anything(0);
17271 +
17272 +       return len;
17273 +}
17274 +
17275 +#define BOUND(_variable, _type) \
17276 +       do { \
17277 +       if (*_variable < sysfs_data->data._type.minimum) \
17278 +               *_variable = sysfs_data->data._type.minimum; \
17279 +       else if (*_variable > sysfs_data->data._type.maximum) \
17280 +               *_variable = sysfs_data->data._type.maximum; \
17281 +       } while (0)
17282 +
17283 +static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
17284 +               const char *my_buf, size_t count)
17285 +{
17286 +       int assigned_temp_buffer = 0, result = count;
17287 +       struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
17288 +
17289 +       if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
17290 +               return -EBUSY;
17291 +
17292 +       ((char *) my_buf)[count] = 0;
17293 +
17294 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
17295 +               toi_prepare_usm();
17296 +
17297 +       switch (sysfs_data->type) {
17298 +       case TOI_SYSFS_DATA_CUSTOM:
17299 +               if (sysfs_data->data.special.write_sysfs)
17300 +                       result = (sysfs_data->data.special.write_sysfs)(my_buf,
17301 +                                       count);
17302 +               break;
17303 +       case TOI_SYSFS_DATA_BIT:
17304 +               {
17305 +               int value = simple_strtoul(my_buf, NULL, 0);
17306 +               if (value)
17307 +                       set_bit(sysfs_data->data.bit.bit,
17308 +                               (sysfs_data->data.bit.bit_vector));
17309 +               else
17310 +                       clear_bit(sysfs_data->data.bit.bit,
17311 +                               (sysfs_data->data.bit.bit_vector));
17312 +               }
17313 +               break;
17314 +       case TOI_SYSFS_DATA_INTEGER:
17315 +               {
17316 +                       int *variable =
17317 +                               sysfs_data->data.integer.variable;
17318 +                       *variable = simple_strtol(my_buf, NULL, 0);
17319 +                       BOUND(variable, integer);
17320 +                       break;
17321 +               }
17322 +       case TOI_SYSFS_DATA_LONG:
17323 +               {
17324 +                       long *variable =
17325 +                               sysfs_data->data.a_long.variable;
17326 +                       *variable = simple_strtol(my_buf, NULL, 0);
17327 +                       BOUND(variable, a_long);
17328 +                       break;
17329 +               }
17330 +       case TOI_SYSFS_DATA_UL:
17331 +               {
17332 +                       unsigned long *variable =
17333 +                               sysfs_data->data.ul.variable;
17334 +                       *variable = simple_strtoul(my_buf, NULL, 0);
17335 +                       BOUND(variable, ul);
17336 +                       break;
17337 +               }
17339 +       case TOI_SYSFS_DATA_STRING:
17340 +               {
17341 +                       int copy_len = count;
17342 +                       char *variable =
17343 +                               sysfs_data->data.string.variable;
17344 +
17345 +                       if (sysfs_data->data.string.max_length &&
17346 +                           (copy_len > sysfs_data->data.string.max_length))
17347 +                               copy_len = sysfs_data->data.string.max_length;
17348 +
17349 +                       if (!variable) {
17350 +                               variable = (char *) toi_get_zeroed_page(31,
17351 +                                               TOI_ATOMIC_GFP);
17352 +                               sysfs_data->data.string.variable = variable;
17353 +                               assigned_temp_buffer = 1;
17354 +                       }
17355 +                       strncpy(variable, my_buf, copy_len);
17356 +                       if (copy_len && my_buf[copy_len - 1] == '\n')
17357 +                               variable[copy_len - 1] = 0;
17358 +                       variable[copy_len] = 0;
17359 +               }
17360 +               break;
17361 +       }
17362 +
17363 +       /* Side effect routine? */
17364 +       if (sysfs_data->write_side_effect)
17365 +               sysfs_data->write_side_effect();
17366 +
17367 +       /* Free temporary buffers */
17368 +       if (assigned_temp_buffer) {
17369 +               toi_free_page(31,
17370 +                       (unsigned long) sysfs_data->data.string.variable);
17371 +               sysfs_data->data.string.variable = NULL;
17372 +       }
17373 +
17374 +       if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
17375 +               toi_cleanup_usm();
17376 +
17377 +       toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
17378 +
17379 +       return result;
17380 +}
17381 +
17382 +static struct sysfs_ops toi_sysfs_ops = {
17383 +       .show   = &toi_attr_show,
17384 +       .store  = &toi_attr_store,
17385 +};
17386 +
17387 +static struct kobj_type toi_ktype = {
17388 +       .sysfs_ops      = &toi_sysfs_ops,
17389 +};
17390 +
17391 +struct kobject *tuxonice_kobj;
17392 +
17393 +/* Non-module sysfs entries.
17394 + *
17395 + * This array contains entries that are automatically registered at
17396 + * boot. Modules and the console code register their own entries separately.
17397 + *
17398 + * NB: If you move do_hibernate, change toi_write_sysfs's test so that
17399 + * toi_start_anything still gets a 1 when the user echoes > do_hibernate!
17400 + */
17401 +
17402 +static struct toi_sysfs_data sysfs_params[] = {
17403 +       { TOI_ATTR("do_hibernate", SYSFS_WRITEONLY),
17404 +         SYSFS_CUSTOM(NULL, NULL, SYSFS_HIBERNATING),
17405 +         .write_side_effect = toi_main_wrapper
17406 +       },
17407 +
17408 +       { TOI_ATTR("do_resume", SYSFS_WRITEONLY),
17409 +         SYSFS_CUSTOM(NULL, NULL, SYSFS_RESUMING),
17410 +         .write_side_effect = __toi_try_resume
17411 +       },
17412 +
17413 +};
17414 +
17415 +void remove_toi_sysdir(struct kobject *kobj)
17416 +{
17417 +       if (!kobj)
17418 +               return;
17419 +
17420 +       kobject_put(kobj);
17421 +}
17422 +
17423 +struct kobject *make_toi_sysdir(char *name)
17424 +{
17425 +       struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
17426 +
17427 +       if (!kobj) {
17428 +               printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
17429 +                               "dir!\n");
17430 +               return NULL;
17431 +       }
17432 +
17433 +       kobj->ktype = &toi_ktype;
17434 +
17435 +       return kobj;
17436 +}
17437 +
17438 +/* toi_register_sysfs_file
17439 + *
17440 + * Helper for registering a new /sysfs/tuxonice entry.
17441 + */
17442 +
17443 +int toi_register_sysfs_file(
17444 +               struct kobject *kobj,
17445 +               struct toi_sysfs_data *toi_sysfs_data)
17446 +{
17447 +       int result;
17448 +
17449 +       if (!toi_sysfs_initialised)
17450 +               toi_initialise_sysfs();
17451 +
17452 +       result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
17453 +       if (result)
17454 +               printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
17455 +                       "returned %d.\n",
17456 +                       toi_sysfs_data->attr.name, result);
17457 +       kobj->ktype = &toi_ktype;
17458 +
17459 +       return result;
17460 +}
17461 +EXPORT_SYMBOL_GPL(toi_register_sysfs_file);
17462 +
17463 +/* toi_unregister_sysfs_file
17464 + *
17465 + * Helper for removing unwanted /sys/power/tuxonice entries.
17466 + *
17467 + */
17468 +void toi_unregister_sysfs_file(struct kobject *kobj,
17469 +               struct toi_sysfs_data *toi_sysfs_data)
17470 +{
17471 +       sysfs_remove_file(kobj, &toi_sysfs_data->attr);
17472 +}
17473 +EXPORT_SYMBOL_GPL(toi_unregister_sysfs_file);
17474 +
17475 +void toi_cleanup_sysfs(void)
17476 +{
17477 +       int i;
17478 +       int numfiles = ARRAY_SIZE(sysfs_params);
17479 +
17480 +       if (!toi_sysfs_initialised)
17481 +               return;
17482 +
17483 +       for (i = 0; i < numfiles; i++)
17484 +               toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
17485 +
17486 +       kobject_put(tuxonice_kobj);
17487 +       toi_sysfs_initialised = 0;
17488 +}
17489 +
17490 +/* toi_initialise_sysfs
17491 + *
17492 + * Initialise the /sys/power/tuxonice directory.
17493 + */
17494 +
17495 +static void toi_initialise_sysfs(void)
17496 +{
17497 +       int i;
17498 +       int numfiles = ARRAY_SIZE(sysfs_params);
17499 +
17500 +       if (toi_sysfs_initialised)
17501 +               return;
17502 +
17503 +       /* Make our TuxOnIce directory a child of /sys/power */
17504 +       tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
17505 +       if (!tuxonice_kobj)
17506 +               return;
17507 +
17508 +       toi_sysfs_initialised = 1;
17509 +
17510 +       for (i = 0; i < numfiles; i++)
17511 +               toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
17512 +}
17513 +
17514 +int toi_sysfs_init(void)
17515 +{
17516 +       toi_initialise_sysfs();
17517 +       return 0;
17518 +}
17519 +
17520 +void toi_sysfs_exit(void)
17521 +{
17522 +       toi_cleanup_sysfs();
17523 +}
17524 diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
17525 new file mode 100644
17526 index 0000000..bf56414
17527 --- /dev/null
17528 +++ b/kernel/power/tuxonice_sysfs.h
17529 @@ -0,0 +1,127 @@
17530 +/*
17531 + * kernel/power/tuxonice_sysfs.h
17532 + *
17533 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
17534 + *
17535 + * This file is released under the GPLv2.
17536 + */
17537 +
17538 +#include <linux/sysfs.h>
17539 +#include "power.h"
17540 +
17541 +struct toi_sysfs_data {
17542 +       struct attribute attr;
17543 +       int type;
17544 +       int flags;
17545 +       union {
17546 +               struct {
17547 +                       unsigned long *bit_vector;
17548 +                       int bit;
17549 +               } bit;
17550 +               struct {
17551 +                       int *variable;
17552 +                       int minimum;
17553 +                       int maximum;
17554 +               } integer;
17555 +               struct {
17556 +                       long *variable;
17557 +                       long minimum;
17558 +                       long maximum;
17559 +               } a_long;
17560 +               struct {
17561 +                       unsigned long *variable;
17562 +                       unsigned long minimum;
17563 +                       unsigned long maximum;
17564 +               } ul;
17565 +               struct {
17566 +                       char *variable;
17567 +                       int max_length;
17568 +               } string;
17569 +               struct {
17570 +                       int (*read_sysfs) (const char *buffer, int count);
17571 +                       int (*write_sysfs) (const char *buffer, int count);
17572 +                       void *data;
17573 +               } special;
17574 +       } data;
17575 +
17576 +       /* Side-effect routines. Used, e.g., for reparsing the
17577 +        * resume= entry when it changes. */
17578 +       void (*read_side_effect) (void);
17579 +       void (*write_side_effect) (void);
17580 +       struct list_head sysfs_data_list;
17581 +};
17582 +
17583 +enum {
17584 +       TOI_SYSFS_DATA_NONE = 1,
17585 +       TOI_SYSFS_DATA_CUSTOM,
17586 +       TOI_SYSFS_DATA_BIT,
17587 +       TOI_SYSFS_DATA_INTEGER,
17588 +       TOI_SYSFS_DATA_UL,
17589 +       TOI_SYSFS_DATA_LONG,
17590 +       TOI_SYSFS_DATA_STRING
17591 +};
17592 +
17593 +#define TOI_ATTR(_name, _mode)      \
17594 +       .attr = {.name  = _name , .mode   = _mode }
17595 +
17596 +#define SYSFS_BIT(_ul, _bit, _flags) \
17597 +       .type = TOI_SYSFS_DATA_BIT, \
17598 +       .flags = _flags, \
17599 +       .data = { .bit = { .bit_vector = _ul, .bit = _bit } }
17600 +
17601 +#define SYSFS_INT(_int, _min, _max, _flags) \
17602 +       .type = TOI_SYSFS_DATA_INTEGER, \
17603 +       .flags = _flags, \
17604 +       .data = { .integer = { .variable = _int, .minimum = _min, \
17605 +                       .maximum = _max } }
17606 +
17607 +#define SYSFS_UL(_ul, _min, _max, _flags) \
17608 +       .type = TOI_SYSFS_DATA_UL, \
17609 +       .flags = _flags, \
17610 +       .data = { .ul = { .variable = _ul, .minimum = _min, \
17611 +                       .maximum = _max } }
17612 +
17613 +#define SYSFS_LONG(_long, _min, _max, _flags) \
17614 +       .type = TOI_SYSFS_DATA_LONG, \
17615 +       .flags = _flags, \
17616 +       .data = { .a_long = { .variable = _long, .minimum = _min, \
17617 +                       .maximum = _max } }
17618 +
17619 +#define SYSFS_STRING(_string, _max_len, _flags) \
17620 +       .type = TOI_SYSFS_DATA_STRING, \
17621 +       .flags = _flags, \
17622 +       .data = { .string = { .variable = _string, .max_length = _max_len } }
17623 +
17624 +#define SYSFS_CUSTOM(_read, _write, _flags) \
17625 +       .type = TOI_SYSFS_DATA_CUSTOM, \
17626 +       .flags = _flags, \
17627 +       .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }
17628 +
17629 +#define SYSFS_WRITEONLY 0200
17630 +#define SYSFS_READONLY 0444
17631 +#define SYSFS_RW 0644
17632 +
17633 +/* Flags */
17634 +#define SYSFS_NEEDS_SM_FOR_READ 1
17635 +#define SYSFS_NEEDS_SM_FOR_WRITE 2
17636 +#define SYSFS_HIBERNATE 4
17637 +#define SYSFS_RESUME 8
17638 +#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
17639 +#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
17640 +#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
17641 +#define SYSFS_NEEDS_SM_FOR_BOTH \
17642 + (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
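+
+/* For instance, an entry whose stores need the userspace storage manager
+ * would combine the macros above roughly as follows ('example' and
+ * 'example_buf' are placeholder names):
+ *
+ *     { TOI_ATTR("example", SYSFS_RW),
+ *       SYSFS_STRING(example_buf, 255, SYSFS_NEEDS_SM_FOR_WRITE) }
+ *
+ * This yields a mode-0644 attribute whose value lives in example_buf and
+ * whose stores bracket the write with storage-manager setup and teardown
+ * (cf. toi_cleanup_usm() in toi_attr_store).
+ */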
17643 +
17644 +int toi_register_sysfs_file(struct kobject *kobj,
17645 +               struct toi_sysfs_data *toi_sysfs_data);
17646 +void toi_unregister_sysfs_file(struct kobject *kobj,
17647 +               struct toi_sysfs_data *toi_sysfs_data);
17648 +
17649 +extern struct kobject *tuxonice_kobj;
17650 +
17651 +struct kobject *make_toi_sysdir(char *name);
17652 +void remove_toi_sysdir(struct kobject *obj);
17653 +extern void toi_cleanup_sysfs(void);
17654 +
17655 +extern int toi_sysfs_init(void);
17656 +extern void toi_sysfs_exit(void);
17657 diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
17658 new file mode 100644
17659 index 0000000..125f21e
17660 --- /dev/null
17661 +++ b/kernel/power/tuxonice_ui.c
17662 @@ -0,0 +1,261 @@
17663 +/*
17664 + * kernel/power/tuxonice_ui.c
17665 + *
17666 + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
17667 + * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
17668 + * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
17669 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
17670 + *
17671 + * This file is released under the GPLv2.
17672 + *
17673 + * Routines for TuxOnIce's user interface.
17674 + *
17675 + * The user interface code talks to a userspace program via a
17676 + * netlink socket.
17677 + *
17678 + * The kernel side:
17679 + * - starts the userui program;
17680 + * - sends text messages and progress bar status;
17681 + *
17682 + * The user space side:
17683 + * - passes messages regarding user requests (abort, toggle reboot, etc.)
17684 + *
17685 + */
17686 +
17687 +#define __KERNEL_SYSCALLS__
17688 +
17689 +#include <linux/reboot.h>
17690 +
17691 +#include "tuxonice_sysfs.h"
17692 +#include "tuxonice_modules.h"
17693 +#include "tuxonice.h"
17694 +#include "tuxonice_ui.h"
17695 +#include "tuxonice_netlink.h"
17696 +#include "tuxonice_power_off.h"
17697 +#include "tuxonice_builtin.h"
17698 +
17699 +static char local_printf_buf[1024];    /* Same as printk - should be safe */
17700 +struct ui_ops *toi_current_ui;
17701 +
17702 +/**
17703 + * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
17704 + *
17705 + * @timeout: Maximum time to wait.
17706 + *
17707 + * Wait for a keypress, either from userui or /dev/console if userui isn't
17708 + * available. The non-userui path is used particularly at boot time, prior
17709 + * to userui being started, when we have an important warning to give to
17710 + * the user.
17711 + */
17712 +static char toi_wait_for_keypress(int timeout)
17713 +{
17714 +       if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
17715 +               return ' ';
17716 +
17717 +       return toi_wait_for_keypress_dev_console(timeout);
17718 +}
17719 +
17720 +/* toi_early_boot_message()
17721 + * Description:        Handle errors early in the process of booting.
17722 + *             The user may press C to continue booting, perhaps
17723 + *             invalidating the image, or space to reboot.
17724 + *             This works from either the serial console or normally
17725 + *             attached keyboard.
17726 + *
17727 + *             Note that we come in here from init, while the kernel is
17728 + *             locked. If we want to get events from the serial console,
17729 + *             we need to temporarily unlock the kernel.
17730 + *
17731 + *             toi_early_boot_message may also be called post-boot.
17732 + *             In this case, it simply printks the message and returns.
17733 + *
17734 + * Arguments:  int     Whether we are able to erase the image.
17735 + *             int     default_answer. What to do when we timeout. This
17736 + *                     will normally be continue, but the user might
17737 + *                     provide command line options (__setup) to override
17738 + *                     particular cases.
17739 + *             char *. Pointer to a string explaining why we're moaning.
17740 + */
17741 +
17742 +#define say(message, a...) printk(KERN_EMERG message, ##a)
17743 +
17744 +void toi_early_boot_message(int message_detail, int default_answer,
17745 +       char *warning_reason, ...)
17746 +{
17747 +       /* Declared unconditionally: the post_ask path below uses
17748 +        * orig_state and continue_req even when we cannot ask the user. */
17749 +       unsigned long orig_state = get_toi_state(), continue_req = 0;
17750 +       unsigned long __maybe_unused orig_loglevel = console_loglevel;
17751 +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
17752 +       int can_ask = 1;
17753 +#else
17754 +       int can_ask = 0;
17755 +#endif
17754 +
17755 +       va_list args;
17756 +       int printed_len;
17757 +
17758 +       if (!toi_wait) {
17759 +               set_toi_state(TOI_CONTINUE_REQ);
17760 +               can_ask = 0;
17761 +       }
17762 +
17763 +       if (warning_reason) {
17764 +               va_start(args, warning_reason);
17765 +               printed_len = vsnprintf(local_printf_buf,
17766 +                               sizeof(local_printf_buf),
17767 +                               warning_reason,
17768 +                               args);
17769 +               va_end(args);
17770 +       }
17771 +
17772 +       if (!test_toi_state(TOI_BOOT_TIME)) {
17773 +               printk("TuxOnIce: %s\n", local_printf_buf);
17774 +               return;
17775 +       }
17776 +
17777 +       if (!can_ask) {
17778 +               continue_req = !!default_answer;
17779 +               goto post_ask;
17780 +       }
17781 +
17782 +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
17783 +       console_loglevel = 7;
17784 +
17785 +       say("=== TuxOnIce ===\n\n");
17786 +       if (warning_reason) {
17787 +               say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
17788 +               switch (message_detail) {
17789 +               case 0:
17790 +                       say("If you continue booting, note that any image WILL"
17791 +                               "NOT BE REMOVED.\nTuxOnIce is unable to do so "
17792 +                               "because the appropriate modules aren't\n"
17793 +                               "loaded. You should manually remove the image "
17794 +                               "to avoid any\npossibility of corrupting your "
17795 +                               "filesystem(s) later.\n");
17796 +                       break;
17797 +               case 1:
17798 +                       say("If you want to use the current TuxOnIce image, "
17799 +                               "reboot and try\nagain with the same kernel "
17800 +                               "that you hibernated from. If you want\n"
17801 +                               "to forget that image, continue and the image "
17802 +                               "will be erased.\n");
17803 +                       break;
17804 +               }
17805 +               say("Press SPACE to reboot or C to continue booting with "
17806 +                       "this kernel\n\n");
17807 +               if (toi_wait > 0)
17808 +                       say("Default action if you don't select one in %d "
17809 +                               "seconds is: %s.\n",
17810 +                               toi_wait,
17811 +                               default_answer == TOI_CONTINUE_REQ ?
17812 +                               "continue booting" : "reboot");
17813 +       } else {
17814 +               say("BIG FAT WARNING!!\n\n"
17815 +                       "You have tried to resume from this image before.\n"
17816 +                       "If it failed once, it may well fail again.\n"
17817 +                       "Would you like to remove the image and boot "
17818 +                       "normally?\nThis will be equivalent to entering "
17819 +                       "noresume on the\nkernel command line.\n\n"
17820 +                       "Press SPACE to remove the image or C to continue "
17821 +                       "resuming.\n\n");
17822 +               if (toi_wait > 0)
17823 +                       say("Default action if you don't select one in %d "
17824 +                               "seconds is: %s.\n", toi_wait,
17825 +                               !!default_answer ?
17826 +                               "continue resuming" : "remove the image");
17827 +       }
17828 +       console_loglevel = orig_loglevel;
17829 +
17830 +       set_toi_state(TOI_SANITY_CHECK_PROMPT);
17831 +       clear_toi_state(TOI_CONTINUE_REQ);
17832 +
17833 +       if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
17834 +               continue_req = !!default_answer;
17835 +       else
17836 +               continue_req = test_toi_state(TOI_CONTINUE_REQ);
17837 +
17838 +#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
17839 +
17840 +post_ask:
17841 +       if ((warning_reason) && (!continue_req))
17842 +               machine_restart(NULL);
17843 +
17844 +       restore_toi_state(orig_state);
17845 +       if (continue_req)
17846 +               set_toi_state(TOI_CONTINUE_REQ);
17847 +}
17848 +#undef say
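+
+/* A typical call (illustrative; 'device_name' is a placeholder) from
+ * resume-time sanity checking might look like:
+ *
+ *     toi_early_boot_message(1, TOI_CONTINUE_REQ,
+ *             "TuxOnIce: Device %s not found.", device_name);
+ *
+ * message_detail selects which explanatory text is printed (see the
+ * switch above), default_answer is the action taken on timeout, and the
+ * rest is a printf-style description of the problem.
+ */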
17849 +
17850 +/*
17851 + * User interface specific /sys/power/tuxonice entries.
17852 + */
17853 +
17854 +static struct toi_sysfs_data sysfs_params[] = {
17855 +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
17856 +       { TOI_ATTR("default_console_level", SYSFS_RW),
17857 +         SYSFS_INT(&toi_bkd.toi_default_console_level, 0, 7, 0)
17858 +       },
17859 +
17860 +       { TOI_ATTR("debug_sections", SYSFS_RW),
17861 +         SYSFS_UL(&toi_bkd.toi_debug_state, 0, 1 << 30, 0)
17862 +       },
17863 +
17864 +       { TOI_ATTR("log_everything", SYSFS_RW),
17865 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_LOGALL, 0)
17866 +       },
17867 +#endif
17868 +       { TOI_ATTR("pm_prepare_console", SYSFS_RW),
17869 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_PM_PREPARE_CONSOLE, 0)
17870 +       }
17871 +};
17872 +
17873 +static struct toi_module_ops userui_ops = {
17874 +       .type                           = MISC_HIDDEN_MODULE,
17875 +       .name                           = "printk ui",
17876 +       .directory                      = "user_interface",
17877 +       .module                         = THIS_MODULE,
17878 +       .sysfs_data                     = sysfs_params,
17879 +       .num_sysfs_entries              = ARRAY_SIZE(sysfs_params),
17881 +};
17882 +
17883 +int toi_register_ui_ops(struct ui_ops *this_ui)
17884 +{
17885 +       if (toi_current_ui) {
17886 +               printk(KERN_INFO "Only one TuxOnIce user interface module can "
17887 +                               "be loaded at a time.");
17888 +               return -EBUSY;
17889 +       }
17890 +
17891 +       toi_current_ui = this_ui;
17892 +
17893 +       return 0;
17894 +}
17895 +
17896 +void toi_remove_ui_ops(struct ui_ops *this_ui)
17897 +{
17898 +       if (toi_current_ui != this_ui)
17899 +               return;
17900 +
17901 +       toi_current_ui = NULL;
17902 +}
17903 +
17904 +/* toi_ui_init
17905 + * Description: Boot time initialisation for user interface.
17906 + */
17907 +
17908 +int toi_ui_init(void)
17909 +{
17910 +       return toi_register_module(&userui_ops);
17911 +}
17912 +
17913 +void toi_ui_exit(void)
17914 +{
17915 +       toi_unregister_module(&userui_ops);
17916 +}
17917 +
17918 +#ifdef CONFIG_TOI_EXPORTS
17919 +EXPORT_SYMBOL_GPL(toi_current_ui);
17920 +EXPORT_SYMBOL_GPL(toi_early_boot_message);
17921 +EXPORT_SYMBOL_GPL(toi_register_ui_ops);
17922 +EXPORT_SYMBOL_GPL(toi_remove_ui_ops);
17923 +#endif
17924 diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
17925 new file mode 100644
17926 index 0000000..7825ddf
17927 --- /dev/null
17928 +++ b/kernel/power/tuxonice_ui.h
17929 @@ -0,0 +1,104 @@
17930 +/*
17931 + * kernel/power/tuxonice_ui.h
17932 + *
17933 + * Copyright (C) 2004-2007 Nigel Cunningham (nigel at tuxonice net)
17934 + */
17935 +
17936 +enum {
17937 +       DONT_CLEAR_BAR,
17938 +       CLEAR_BAR
17939 +};
17940 +
17941 +enum {
17942 +       /* Userspace -> Kernel */
17943 +       USERUI_MSG_ABORT = 0x11,
17944 +       USERUI_MSG_SET_STATE = 0x12,
17945 +       USERUI_MSG_GET_STATE = 0x13,
17946 +       USERUI_MSG_GET_DEBUG_STATE = 0x14,
17947 +       USERUI_MSG_SET_DEBUG_STATE = 0x15,
17948 +       USERUI_MSG_SPACE = 0x18,
17949 +       USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
17950 +       USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
17951 +       USERUI_MSG_GET_LOGLEVEL = 0x1C,
17952 +       USERUI_MSG_SET_LOGLEVEL = 0x1D,
17953 +       USERUI_MSG_PRINTK = 0x1E,
17954 +
17955 +       /* Kernel -> Userspace */
17956 +       USERUI_MSG_MESSAGE = 0x21,
17957 +       USERUI_MSG_PROGRESS = 0x22,
17958 +       USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
17959 +
17960 +       USERUI_MSG_MAX,
17961 +};
17962 +
17963 +struct userui_msg_params {
17964 +       unsigned long a, b, c, d;
17965 +       char text[255];
17966 +};
17967 +
17968 +struct ui_ops {
17969 +       char (*wait_for_key) (int timeout);
17970 +       unsigned long (*update_status) (unsigned long value,
17971 +               unsigned long maximum, const char *fmt, ...);
17972 +       void (*prepare_status) (int clearbar, const char *fmt, ...);
17973 +       void (*cond_pause) (int pause, char *message);
17974 +       void (*abort)(int result_code, const char *fmt, ...);
17975 +       void (*prepare)(void);
17976 +       void (*cleanup)(void);
17977 +       void (*post_atomic_restore)(void);
17978 +       void (*message)(unsigned long section, unsigned long level,
17979 +               int normally_logged, const char *fmt, ...);
17980 +};
17981 +
17982 +extern struct ui_ops *toi_current_ui;
17983 +
17984 +#define toi_update_status(val, max, fmt, args...) \
17985 + (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
17986 +       max)
17987 +
17988 +#define toi_ui_post_atomic_restore(void) \
17989 +       do { if (toi_current_ui) \
17990 +               (toi_current_ui->post_atomic_restore)(); \
17991 +       } while (0)
17992 +
17993 +#define toi_prepare_console(void) \
17994 +       do { if (toi_current_ui) \
17995 +               (toi_current_ui->prepare)(); \
17996 +       } while (0)
17997 +
17998 +#define toi_cleanup_console(void) \
17999 +       do { if (toi_current_ui) \
18000 +               (toi_current_ui->cleanup)(); \
18001 +       } while (0)
18002 +
18003 +#define abort_hibernate(result, fmt, args...) \
18004 +       do { if (toi_current_ui) \
18005 +               (toi_current_ui->abort)(result, fmt, ##args); \
18006 +            else { \
18007 +               set_abort_result(result); \
18008 +            } \
18009 +       } while (0)
18010 +
18011 +#define toi_cond_pause(pause, message) \
18012 +       do { if (toi_current_ui) \
18013 +               (toi_current_ui->cond_pause)(pause, message); \
18014 +       } while (0)
18015 +
18016 +#define toi_prepare_status(clear, fmt, args...) \
18017 +       do { if (toi_current_ui) \
18018 +               (toi_current_ui->prepare_status)(clear, fmt, ##args); \
18019 +            else \
18020 +               printk(KERN_ERR fmt "%s", ##args, "\n"); \
18021 +       } while (0)
18022 +
18023 +#define toi_message(sn, lev, log, fmt, a...) \
18024 +do { \
18025 +       if (toi_current_ui && (!sn || test_debug_state(sn))) \
18026 +               toi_current_ui->message(sn, lev, log, fmt, ##a); \
18027 +} while (0)
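+
+/* Typical caller pattern (illustrative; write_page() is a placeholder):
+ *
+ *     unsigned long next = 0;
+ *     toi_prepare_status(CLEAR_BAR, "Writing caches...");
+ *     for (i = 0; i < count; i++) {
+ *             write_page(i);
+ *             if (i >= next)
+ *                     next = toi_update_status(i, count, NULL);
+ *     }
+ *
+ * The next-update return value keeps callers from flooding userui with
+ * updates that would not visibly move the bar.
+ */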
18028 +
18029 +__exit void toi_ui_cleanup(void);
18030 +extern int toi_ui_init(void);
18031 +extern void toi_ui_exit(void);
18032 +extern int toi_register_ui_ops(struct ui_ops *this_ui);
18033 +extern void toi_remove_ui_ops(struct ui_ops *this_ui);
18034 diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
18035 new file mode 100644
18036 index 0000000..3454709
18037 --- /dev/null
18038 +++ b/kernel/power/tuxonice_userui.c
18039 @@ -0,0 +1,675 @@
18040 +/*
18041 + * kernel/power/user_ui.c
18042 + *
18043 + * Copyright (C) 2005-2007 Bernard Blackham
18044 + * Copyright (C) 2002-2007 Nigel Cunningham (nigel at tuxonice net)
18045 + *
18046 + * This file is released under the GPLv2.
18047 + *
18048 + * Routines for TuxOnIce's user interface.
18049 + *
18050 + * The user interface code talks to a userspace program via a
18051 + * netlink socket.
18052 + *
18053 + * The kernel side:
18054 + * - starts the userui program;
18055 + * - sends text messages and progress bar status;
18056 + *
18057 + * The user space side:
18058 + * - passes messages regarding user requests (abort, toggle reboot, etc.)
18059 + *
18060 + */
18061 +
18062 +#define __KERNEL_SYSCALLS__
18063 +
18064 +#include <linux/suspend.h>
18065 +#include <linux/freezer.h>
18066 +#include <linux/console.h>
18067 +#include <linux/ctype.h>
18068 +#include <linux/tty.h>
18069 +#include <linux/vt_kern.h>
18070 +#include <linux/module.h>
18071 +#include <linux/reboot.h>
18072 +#include <linux/kmod.h>
18073 +#include <linux/security.h>
18074 +#include <linux/syscalls.h>
18075 +
18076 +#include "tuxonice_sysfs.h"
18077 +#include "tuxonice_modules.h"
18078 +#include "tuxonice.h"
18079 +#include "tuxonice_ui.h"
18080 +#include "tuxonice_netlink.h"
18081 +#include "tuxonice_power_off.h"
18082 +
18083 +static char local_printf_buf[1024];    /* Same as printk - should be safe */
18084 +
18085 +static struct user_helper_data ui_helper_data;
18086 +static struct toi_module_ops userui_ops;
18087 +static int orig_kmsg;
18088 +
18089 +static char lastheader[512];
18090 +static int lastheader_message_len;
18091 +static int ui_helper_changed; /* Used at resume time so we don't overwrite
18092 +                                a value set from initrd/ramfs. */
18093 +
18094 +/* Number of distinct progress amounts that userspace can display */
18095 +static int progress_granularity = 30;
18096 +
18097 +static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
18098 +
18099 +/**
18100 + * ui_nl_set_state - Update toi_action based on a message from userui.
18101 + *
18102 + * @n: The bit (1 << bit) to set.
18103 + */
18104 +static void ui_nl_set_state(int n)
18105 +{
18106 +       /* Only let them change certain settings */
18107 +       static const int toi_action_mask =
18108 +               (1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
18109 +               (1 << TOI_LOGALL) |
18110 +               (1 << TOI_SINGLESTEP) |
18111 +               (1 << TOI_PAUSE_NEAR_PAGESET_END);
18112 +
18113 +       toi_bkd.toi_action = (toi_bkd.toi_action & (~toi_action_mask)) |
18114 +               (n & toi_action_mask);
18115 +
18116 +       if (!test_action_state(TOI_PAUSE) &&
18117 +                       !test_action_state(TOI_SINGLESTEP))
18118 +               wake_up_interruptible(&userui_wait_for_key);
18119 +}
18120 +
18121 +/**
18122 + * userui_post_atomic_restore - Tell userui that atomic restore just happened.
18123 + *
18124 + * Tell userui that atomic restore just occurred, so that it can do things like
18125 + * redrawing the screen, re-getting settings and so on.
18126 + */
18127 +static void userui_post_atomic_restore(void)
18128 +{
18129 +       toi_send_netlink_message(&ui_helper_data,
18130 +                       USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
18131 +}
18132 +
18133 +/**
18134 + * userui_storage_needed - Report how much memory in image header is needed.
18135 + */
18136 +static int userui_storage_needed(void)
18137 +{
18138 +       return sizeof(ui_helper_data.program) + 1 + sizeof(int);
18139 +}
18140 +
18141 +/**
18142 + * userui_save_config_info - Fill buffer with config info for image header.
18143 + *
18144 + * @buf: Buffer into which to put the config info we want to save.
18145 + */
18146 +static int userui_save_config_info(char *buf)
18147 +{
18148 +       *((int *) buf) = progress_granularity;
18149 +       memcpy(buf + sizeof(int), ui_helper_data.program,
18150 +                       sizeof(ui_helper_data.program));
18151 +       return sizeof(ui_helper_data.program) + sizeof(int) + 1;
18152 +}
18153 +
18154 +/**
18155 + * userui_load_config_info - Restore config info from buffer.
18156 + *
18157 + * @buf: Buffer containing header info loaded.
18158 + * @size: Size of data loaded for this module.
18159 + */
18160 +static void userui_load_config_info(char *buf, int size)
18161 +{
18162 +       progress_granularity = *((int *) buf);
18163 +       size -= sizeof(int);
18164 +
18165 +       /* Don't load the saved path if one has already been set */
18166 +       if (ui_helper_changed)
18167 +               return;
18168 +
18169 +       if (size > sizeof(ui_helper_data.program))
18170 +               size = sizeof(ui_helper_data.program);
18171 +
18172 +       memcpy(ui_helper_data.program, buf + sizeof(int), size);
18173 +       ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
18174 +}
18175 +
18176 +/**
18177 + * set_ui_program_set - Record that the userui program was changed.
18178 + *
18179 + * Side effect routine for when the userui program is set. In an initrd or
18180 + * ramfs, the user may set a location for the userui program. If this happens,
18181 + * we don't want to reload the value that was saved in the image header. This
18182 + * routine allows us to flag that we shouldn't restore the program name from
18183 + * the image header.
18184 + */
18185 +static void set_ui_program_set(void)
18186 +{
18187 +       ui_helper_changed = 1;
18188 +}
18189 +
18190 +/**
18191 + * userui_memory_needed - Tell core how much memory to reserve for us.
18192 + */
18193 +static int userui_memory_needed(void)
18194 +{
18195 +       /* ballpark figure of 128 pages */
18196 +       return 128 * PAGE_SIZE;
18197 +}
18198 +
18199 +/**
18200 + * userui_update_status - Update the progress bar and (if on) in-bar message.
18201 + *
18202 + * @value: Current progress percentage numerator.
18203 + * @maximum: Current progress percentage denominator.
18204 + * @fmt: Message to be displayed in the middle of the progress bar.
18205 + *
18206 + * Note that a NULL message does not mean that any previous message is erased!
18207 + * For that, you need toi_prepare_status with clearbar on.
18208 + *
18209 + * Returns an unsigned long, being the next numerator (as determined by the
18210 + * maximum and progress granularity) where status needs to be updated.
18211 + * This is to reduce unnecessary calls to update_status.
18212 + */
18213 +static unsigned long userui_update_status(unsigned long value,
18214 +               unsigned long maximum, const char *fmt, ...)
18215 +{
18216 +       static int last_step = -1;
18217 +       struct userui_msg_params msg;
18218 +       int bitshift;
18219 +       int this_step;
18220 +       unsigned long next_update;
18221 +
18222 +       if (ui_helper_data.pid == -1)
18223 +               return 0;
18224 +
18225 +       if ((!maximum) || (!progress_granularity))
18226 +               return maximum;
18227 +
18228 +       /* value is unsigned long, so it cannot be negative;
18229 +        * only the upper bound needs clamping. */
18230 +
18231 +       if (value > maximum)
18232 +               value = maximum;
18233 +
18234 +       /* Try to avoid math problems - we can't do 64 bit math here
18235 +        * (and shouldn't need it - anyone got screen resolution
18236 +        * of 65536 pixels or more?) */
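+       /* Worked example: maximum = 2^20, progress_granularity = 30.
+        * bitshift = fls(2^20) - 16 = 5, so temp_maximum = 2^15.
+        * With value = 2^19, temp_value = 2^14 and
+        * this_step = 2^14 * 30 / 2^15 = 15, halfway along the bar,
+        * while next_update names the first value falling in step 16. */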
18237 +       bitshift = fls(maximum) - 16;
18238 +       if (bitshift > 0) {
18239 +               unsigned long temp_maximum = maximum >> bitshift;
18240 +               unsigned long temp_value = value >> bitshift;
18241 +               this_step = (int)
18242 +                       (temp_value * progress_granularity / temp_maximum);
18243 +               next_update = (((this_step + 1) * temp_maximum /
18244 +                                       progress_granularity) + 1) << bitshift;
18245 +       } else {
18246 +               this_step = (int) (value * progress_granularity / maximum);
18247 +               next_update = ((this_step + 1) * maximum /
18248 +                               progress_granularity) + 1;
18249 +       }
18250 +
18251 +       if (this_step == last_step)
18252 +               return next_update;
18253 +
18254 +       memset(&msg, 0, sizeof(msg));
18255 +
18256 +       msg.a = this_step;
18257 +       msg.b = progress_granularity;
18258 +
18259 +       if (fmt) {
18260 +               va_list args;
18261 +               va_start(args, fmt);
18262 +               vsnprintf(msg.text, sizeof(msg.text), fmt, args);
18263 +               va_end(args);
18264 +               msg.text[sizeof(msg.text)-1] = '\0';
18265 +       }
18266 +
18267 +       toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
18268 +                       &msg, sizeof(msg));
18269 +       last_step = this_step;
18270 +
18271 +       return next_update;
18272 +}
18273 +
18274 +/**
18275 + * userui_message - Display a message without necessarily logging it.
18276 + *
18277 + * @section: Type of message. Messages can be filtered by type.
18278 + * @level: Degree of importance of the message. Lower values = higher priority.
18279 + * @normally_logged: Whether logged even if log_everything is off.
18280 + * @fmt: Message (and parameters).
18281 + *
18282 + * This function is intended to do the same job as printk, but without normally
18283 + * logging what is printed. The point is to be able to get debugging info on
18284 + * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
18285 + *
18286 + * It may be called from an interrupt context - can't sleep!
18287 + */
18288 +static void userui_message(unsigned long section, unsigned long level,
18289 +               int normally_logged, const char *fmt, ...)
18290 +{
18291 +       struct userui_msg_params msg;
18292 +
18293 +       if ((level) && (level > console_loglevel))
18294 +               return;
18295 +
18296 +       memset(&msg, 0, sizeof(msg));
18297 +
18298 +       msg.a = section;
18299 +       msg.b = level;
18300 +       msg.c = normally_logged;
18301 +
18302 +       if (fmt) {
18303 +               va_list args;
18304 +               va_start(args, fmt);
18305 +               vsnprintf(msg.text, sizeof(msg.text), fmt, args);
18306 +               va_end(args);
18307 +               msg.text[sizeof(msg.text)-1] = '\0';
18308 +       }
18309 +
18310 +       if (test_action_state(TOI_LOGALL))
18311 +               printk(KERN_INFO "%s\n", msg.text);
18312 +
18313 +       toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
18314 +                       &msg, sizeof(msg));
18315 +}
18316 +
18317 +/**
18318 + * wait_for_key_via_userui - Wait for userui to receive a keypress.
18319 + */
18320 +static void wait_for_key_via_userui(void)
18321 +{
18322 +       DECLARE_WAITQUEUE(wait, current);
18323 +
18324 +       add_wait_queue(&userui_wait_for_key, &wait);
18325 +       set_current_state(TASK_INTERRUPTIBLE);
18326 +
18327 +       /* We are already on the queue with our state set, so a bare
18328 +        * schedule() suffices; interruptible_sleep_on() would queue us
18329 +        * a second time and is deprecated as racy. */
18330 +       schedule();
18328 +
18329 +       set_current_state(TASK_RUNNING);
18330 +       remove_wait_queue(&userui_wait_for_key, &wait);
18331 +}
18332 +
18333 +/**
18334 + * userui_prepare_status - Display high level messages.
18335 + *
18336 + * @clearbar: Whether to clear the progress bar.
18337 + * @fmt...: New message for the title.
18338 + *
18339 + * Prepare the 'nice display', drawing the header and version, along with the
18340 + * current action and perhaps also resetting the progress bar.
18341 + */
18342 +static void userui_prepare_status(int clearbar, const char *fmt, ...)
18343 +{
18344 +       va_list args;
18345 +
18346 +       if (fmt) {
18347 +               va_start(args, fmt);
18348 +               lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
18349 +               va_end(args);
18350 +       }
18351 +
18352 +       if (clearbar)
18353 +               toi_update_status(0, 1, NULL);
18354 +
18355 +       if (ui_helper_data.pid == -1)
18356 +               printk(KERN_EMERG "%s\n", lastheader);
18357 +       else
18358 +               toi_message(0, TOI_STATUS, 1, lastheader, NULL);
18359 +}
18360 +
18361 +/**
18362 + * toi_wait_for_keypress - Wait for keypress via userui.
18363 + *
18364 + * @timeout: Maximum time to wait.
18365 + *
18366 + * Wait for a keypress from userui.
18367 + *
18368 + * FIXME: Implement timeout?
18369 + */
18370 +static char userui_wait_for_keypress(int timeout)
18371 +{
18372 +       char key = '\0';
18373 +
18374 +       if (ui_helper_data.pid != -1) {
18375 +               wait_for_key_via_userui();
18376 +               key = ' ';
18377 +       }
18378 +
18379 +       return key;
18380 +}
18381 +
18382 +/**
18383 + * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
18384 + *
18385 + * @result_code: Reason why we're aborting (1 << bit).
18386 + * @fmt: Message to display if telling the user what's going on.
18387 + *
18388 + * Abort a cycle. If this wasn't at the user's request (and we're displaying
18389 + * output), tell the user why and wait for them to acknowledge the message.
18390 + */
18391 +static void userui_abort_hibernate(int result_code, const char *fmt, ...)
18392 +{
18393 +       va_list args;
18394 +       int printed_len = 0;
18395 +
18396 +       set_result_state(result_code);
18397 +
18398 +       if (test_result_state(TOI_ABORTED))
18399 +               return;
18400 +
18401 +       set_result_state(TOI_ABORTED);
18402 +
18403 +       if (test_result_state(TOI_ABORT_REQUESTED))
18404 +               return;
18405 +
18406 +       va_start(args, fmt);
18407 +       /* vscnprintf returns the length actually written, so printed_len
18408 +        * cannot run past the buffer even if the message is truncated. */
18409 +       printed_len = vscnprintf(local_printf_buf, sizeof(local_printf_buf),
18410 +                       fmt, args);
18411 +       va_end(args);
18412 +       if (ui_helper_data.pid != -1)
18413 +               printed_len += scnprintf(local_printf_buf + printed_len,
18414 +                               sizeof(local_printf_buf) - printed_len,
18415 +                               " (Press SPACE to continue)");
18413 +
18414 +       toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
18415 +
18416 +       if (ui_helper_data.pid != -1)
18417 +               userui_wait_for_keypress(0);
18418 +}
18419 +
18420 +/**
18421 + * request_abort_hibernate - Abort hibernating or resuming at user request.
18422 + *
18423 + * Handle the user requesting the cancellation of a hibernation or resume by
18424 + * pressing escape.
18425 + */
18426 +static void request_abort_hibernate(void)
18427 +{
18428 +       if (test_result_state(TOI_ABORT_REQUESTED))
18429 +               return;
18430 +
18431 +       if (test_toi_state(TOI_NOW_RESUMING)) {
18432 +               toi_prepare_status(CLEAR_BAR, "Escape pressed. "
18433 +                                       "Powering down again.");
18434 +               set_toi_state(TOI_STOP_RESUME);
18435 +               while (!test_toi_state(TOI_IO_STOPPED))
18436 +                       schedule();
18437 +               if (toiActiveAllocator->mark_resume_attempted)
18438 +                       toiActiveAllocator->mark_resume_attempted(0);
18439 +               toi_power_down();
18440 +       }
18441 +
18442 +       toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
18443 +                                       " ABORTING HIBERNATION ---");
18444 +       set_abort_result(TOI_ABORT_REQUESTED);
18445 +       wake_up_interruptible(&userui_wait_for_key);
18446 +}
18447 +
18448 +/**
18449 + * userui_user_rcv_msg - Receive a netlink message from userui.
18450 + *
18451 + * @skb: skb received.
18452 + * @nlh: Netlink header received.
18453 + */
18454 +static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
18455 +{
18456 +       int type;
18457 +       int *data;
18458 +
18459 +       type = nlh->nlmsg_type;
18460 +
18461 +       /* A control message: ignore them */
18462 +       if (type < NETLINK_MSG_BASE)
18463 +               return 0;
18464 +
18465 +       /* Unknown message: reply with EINVAL */
18466 +       if (type >= USERUI_MSG_MAX)
18467 +               return -EINVAL;
18468 +
18469 +       /* All operations require privileges, even GET */
18470 +       if (security_netlink_recv(skb, CAP_NET_ADMIN))
18471 +               return -EPERM;
18472 +
18473 +       /* Only allow one task to receive NOFREEZE privileges */
18474 +       if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
18475 +               printk(KERN_INFO "Got NOFREEZE_ME request when "
18476 +                       "ui_helper_data.pid is %d.\n", ui_helper_data.pid);
18477 +               return -EBUSY;
18478 +       }
18479 +
18480 +       data = (int *) NLMSG_DATA(nlh);
18481 +
18482 +       switch (type) {
18483 +       case USERUI_MSG_ABORT:
18484 +               request_abort_hibernate();
18485 +               return 0;
18486 +       case USERUI_MSG_GET_STATE:
18487 +               toi_send_netlink_message(&ui_helper_data,
18488 +                               USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
18489 +                               sizeof(toi_bkd.toi_action));
18490 +               return 0;
18491 +       case USERUI_MSG_GET_DEBUG_STATE:
18492 +               toi_send_netlink_message(&ui_helper_data,
18493 +                               USERUI_MSG_GET_DEBUG_STATE,
18494 +                               &toi_bkd.toi_debug_state,
18495 +                               sizeof(toi_bkd.toi_debug_state));
18496 +               return 0;
18497 +       case USERUI_MSG_SET_STATE:
18498 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18499 +                       return -EINVAL;
18500 +               ui_nl_set_state(*data);
18501 +               return 0;
18502 +       case USERUI_MSG_SET_DEBUG_STATE:
18503 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18504 +                       return -EINVAL;
18505 +               toi_bkd.toi_debug_state = (*data);
18506 +               return 0;
18507 +       case USERUI_MSG_SPACE:
18508 +               wake_up_interruptible(&userui_wait_for_key);
18509 +               return 0;
18510 +       case USERUI_MSG_GET_POWERDOWN_METHOD:
18511 +               toi_send_netlink_message(&ui_helper_data,
18512 +                               USERUI_MSG_GET_POWERDOWN_METHOD,
18513 +                               &toi_poweroff_method,
18514 +                               sizeof(toi_poweroff_method));
18515 +               return 0;
18516 +       case USERUI_MSG_SET_POWERDOWN_METHOD:
18517 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18518 +                       return -EINVAL;
18519 +               toi_poweroff_method = (*data);
18520 +               return 0;
18521 +       case USERUI_MSG_GET_LOGLEVEL:
18522 +               toi_send_netlink_message(&ui_helper_data,
18523 +                               USERUI_MSG_GET_LOGLEVEL,
18524 +                               &toi_bkd.toi_default_console_level,
18525 +                               sizeof(toi_bkd.toi_default_console_level));
18526 +               return 0;
18527 +       case USERUI_MSG_SET_LOGLEVEL:
18528 +               if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
18529 +                       return -EINVAL;
18530 +               toi_bkd.toi_default_console_level = (*data);
18531 +               return 0;
18532 +       case USERUI_MSG_PRINTK:
18533 +               /* Data comes from userspace - never use it as the
18534 +                * format string. */
18535 +               printk("%s", (char *) data);
18534 +               return 0;
18535 +       }
18536 +
18537 +       /* Unhandled here */
18538 +       return 1;
18539 +}
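+
+/* The exchange is plain netlink. As a sketch from the userspace side
+ * (error handling omitted; 'nl_fd' is assumed to be a socket already
+ * bound to NETLINK_TOI_USERUI), setting the log level to 7 amounts to:
+ *
+ *     struct { struct nlmsghdr nlh; int data; } req = {
+ *             .nlh = { .nlmsg_len = NLMSG_LENGTH(sizeof(int)),
+ *                      .nlmsg_type = USERUI_MSG_SET_LOGLEVEL },
+ *             .data = 7,
+ *     };
+ *     send(nl_fd, &req, req.nlh.nlmsg_len, 0);
+ *
+ * which lands in userui_user_rcv_msg() above and updates
+ * toi_bkd.toi_default_console_level.
+ */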
18540 +
18541 +/**
18542 + * userui_cond_pause - Possibly pause at user request.
18543 + *
18544 + * @pause: Whether to pause or just display the message.
18545 + * @message: Message to display at the start of pausing.
18546 + *
18547 + * Potentially pause and wait for the user to tell us to continue. We normally
18548 + * only pause when @pause is set. While paused, the user can do things like
18549 + * changing the loglevel, toggling the display of debugging sections and such
18550 + * like.
18551 + */
18552 +static void userui_cond_pause(int pause, char *message)
18553 +{
18554 +       int displayed_message = 0, last_key = 0;
18555 +
18556 +       while (last_key != 32 &&
18557 +               ui_helper_data.pid != -1 &&
18558 +               ((test_action_state(TOI_PAUSE) && pause) ||
18559 +                (test_action_state(TOI_SINGLESTEP)))) {
18560 +               if (!displayed_message) {
18561 +                       toi_prepare_status(DONT_CLEAR_BAR,
18562 +                          "%s Press SPACE to continue.%s",
18563 +                          message ? message : "",
18564 +                          (test_action_state(TOI_SINGLESTEP)) ?
18565 +                          " Single step on." : "");
18566 +                       displayed_message = 1;
18567 +               }
18568 +               last_key = userui_wait_for_keypress(0);
18569 +       }
18570 +       schedule();
18571 +}
18572 +
18573 +/**
18574 + * userui_prepare_console - Prepare the console for use.
18575 + *
18576 + * Prepare a console for use, saving current kmsg settings and attempting to
18577 + * start userui. Console loglevel changes are handled by userui.
18578 + */
18579 +static void userui_prepare_console(void)
18580 +{
18581 +       orig_kmsg = kmsg_redirect;
18582 +       kmsg_redirect = fg_console + 1;
18583 +
18584 +       ui_helper_data.pid = -1;
18585 +
18586 +       if (!userui_ops.enabled) {
18587 +               printk("TuxOnIce: Userui disabled.\n");
18588 +               return;
18589 +       }
18590 +
18591 +       if (*ui_helper_data.program)
18592 +               toi_netlink_setup(&ui_helper_data);
18593 +       else
18594 +               printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
18595 +}
18596 +
18597 +/**
18598 + * userui_cleanup_console - Cleanup after a cycle.
18599 + *
18600 + * Tell userui to cleanup, and restore kmsg_redirect to its original value.
18601 + */
18602 +
18603 +static void userui_cleanup_console(void)
18604 +{
18605 +       if (ui_helper_data.pid > -1)
18606 +               toi_netlink_close(&ui_helper_data);
18607 +
18608 +       kmsg_redirect = orig_kmsg;
18609 +}
18610 +
18611 +/*
18612 + * User interface specific /sys/power/tuxonice entries.
18613 + */
18614 +
18615 +static struct toi_sysfs_data sysfs_params[] = {
18616 +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
18617 +       { TOI_ATTR("enable_escape", SYSFS_RW),
18618 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_CAN_CANCEL, 0)
18619 +       },
18620 +
18621 +       { TOI_ATTR("pause_between_steps", SYSFS_RW),
18622 +         SYSFS_BIT(&toi_bkd.toi_action, TOI_PAUSE, 0)
18623 +       },
18624 +
18625 +       { TOI_ATTR("enabled", SYSFS_RW),
18626 +         SYSFS_INT(&userui_ops.enabled, 0, 1, 0)
18627 +       },
18628 +
18629 +       { TOI_ATTR("progress_granularity", SYSFS_RW),
18630 +         SYSFS_INT(&progress_granularity, 1, 2048, 0)
18631 +       },
18632 +
18633 +       { TOI_ATTR("program", SYSFS_RW),
18634 +         SYSFS_STRING(ui_helper_data.program, 255, 0),
18635 +         .write_side_effect = set_ui_program_set,
18636 +       },
18637 +#endif
18638 +};
18639 +
18640 +static struct toi_module_ops userui_ops = {
18641 +       .type                           = MISC_MODULE,
18642 +       .name                           = "userui",
18643 +       .shared_directory               = "user_interface",
18644 +       .module                         = THIS_MODULE,
18645 +       .storage_needed                 = userui_storage_needed,
18646 +       .save_config_info               = userui_save_config_info,
18647 +       .load_config_info               = userui_load_config_info,
18648 +       .memory_needed                  = userui_memory_needed,
18649 +       .sysfs_data                     = sysfs_params,
18650 +       .num_sysfs_entries              = ARRAY_SIZE(sysfs_params),
18652 +};
18653 +
18654 +static struct ui_ops my_ui_ops = {
18655 +       .post_atomic_restore            = userui_post_atomic_restore,
18656 +       .update_status                  = userui_update_status,
18657 +       .message                        = userui_message,
18658 +       .prepare_status                 = userui_prepare_status,
18659 +       .abort                          = userui_abort_hibernate,
18660 +       .cond_pause                     = userui_cond_pause,
18661 +       .prepare                        = userui_prepare_console,
18662 +       .cleanup                        = userui_cleanup_console,
18663 +       .wait_for_key                   = userui_wait_for_keypress,
18664 +};
18665 +
18666 +/**
18667 + * toi_user_ui_init - Boot time initialisation for user interface.
18668 + *
18669 + * Invoked from the core init routine.
18670 + */
18671 +static __init int toi_user_ui_init(void)
18672 +{
18673 +       int result;
18674 +
18675 +       ui_helper_data.nl = NULL;
18676 +       strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH,
18677 +                       sizeof(ui_helper_data.program) - 1);
18677 +       ui_helper_data.pid = -1;
18678 +       ui_helper_data.skb_size = sizeof(struct userui_msg_params);
18679 +       ui_helper_data.pool_limit = 6;
18680 +       ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
18681 +       ui_helper_data.name = "userspace ui";
18682 +       ui_helper_data.rcv_msg = userui_user_rcv_msg;
18683 +       ui_helper_data.interface_version = 7;
18684 +       ui_helper_data.must_init = 0;
18685 +       ui_helper_data.not_ready = userui_cleanup_console;
18686 +       init_completion(&ui_helper_data.wait_for_process);
18687 +       result = toi_register_module(&userui_ops);
18688 +       if (!result)
18689 +               result = toi_register_ui_ops(&my_ui_ops);
18690 +       if (result)
18691 +               toi_unregister_module(&userui_ops);
18692 +
18693 +       return result;
18694 +}
18695 +
18696 +#ifdef MODULE
18697 +/**
18698 + * toi_user_ui_exit - Cleanup code for when the module is unloaded.
18699 + */
18700 +static __exit void toi_user_ui_exit(void)
18701 +{
18702 +       toi_netlink_close_complete(&ui_helper_data);
18703 +       toi_remove_ui_ops(&my_ui_ops);
18704 +       toi_unregister_module(&userui_ops);
18705 +}
18706 +
18707 +module_init(toi_user_ui_init);
18708 +module_exit(toi_user_ui_exit);
18709 +MODULE_AUTHOR("Nigel Cunningham");
18710 +MODULE_DESCRIPTION("TuxOnIce Userui Support");
18711 +MODULE_LICENSE("GPL");
18712 +#else
18713 +late_initcall(toi_user_ui_init);
18714 +#endif
18715 diff --git a/kernel/printk.c b/kernel/printk.c
18716 index b51b156..2f4e8da 100644
18717 --- a/kernel/printk.c
18718 +++ b/kernel/printk.c
18719 @@ -32,6 +32,8 @@
18720  #include <linux/security.h>
18721  #include <linux/bootmem.h>
18722  #include <linux/syscalls.h>
18723 +#include <linux/jiffies.h>
18724 +#include <linux/suspend.h>
18725  
18726  #include <asm/uaccess.h>
18727  
18728 @@ -101,9 +103,12 @@ static DEFINE_SPINLOCK(logbuf_lock);
18729   * The indices into log_buf are not constrained to log_buf_len - they
18730   * must be masked before subscripting
18731   */
18732 -static unsigned log_start;     /* Index into log_buf: next char to be read by syslog() */
18733 -static unsigned con_start;     /* Index into log_buf: next char to be sent to consoles */
18734 -static unsigned log_end;       /* Index into log_buf: most-recently-written-char + 1 */
18735 +/* Index into log_buf: next char to be read by syslog() */
18736 +static unsigned POSS_NOSAVE log_start;
18737 +/* Index into log_buf: next char to be sent to consoles */
18738 +static unsigned POSS_NOSAVE con_start;
18739 +/* Index into log_buf: most-recently-written-char + 1 */
18740 +static unsigned POSS_NOSAVE log_end;
18741  
18742  /*
18743   *     Array of consoles built from command line options (console=)
18744 @@ -131,10 +136,11 @@ static int console_may_schedule;
18745  
18746  #ifdef CONFIG_PRINTK
18747  
18748 -static char __log_buf[__LOG_BUF_LEN];
18749 -static char *log_buf = __log_buf;
18750 -static int log_buf_len = __LOG_BUF_LEN;
18751 -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
18752 +static POSS_NOSAVE char __log_buf[__LOG_BUF_LEN];
18753 +static POSS_NOSAVE char *log_buf = __log_buf;
18754 +static POSS_NOSAVE int log_buf_len = __LOG_BUF_LEN;
18755 +/* Number of chars produced since last read+clear operation */
18756 +static POSS_NOSAVE unsigned logged_chars;
18757  
18758  static int __init log_buf_len_setup(char *str)
18759  {
18760 @@ -937,6 +943,7 @@ void suspend_console(void)
18761         acquire_console_sem();
18762         console_suspended = 1;
18763  }
18764 +EXPORT_SYMBOL(suspend_console);
18765  
18766  void resume_console(void)
18767  {
18768 @@ -945,6 +952,7 @@ void resume_console(void)
18769         console_suspended = 0;
18770         release_console_sem();
18771  }
18772 +EXPORT_SYMBOL(resume_console);
18773  
18774  /**
18775   * acquire_console_sem - lock the console system for exclusive use.
18776 diff --git a/kernel/timer.c b/kernel/timer.c
18777 index 03bc7f1..d70831d 100644
18778 --- a/kernel/timer.c
18779 +++ b/kernel/timer.c
18780 @@ -37,6 +37,8 @@
18781  #include <linux/delay.h>
18782  #include <linux/tick.h>
18783  #include <linux/kallsyms.h>
18784 +#include <linux/notifier.h>
18785 +#include <linux/suspend.h>
18786  
18787  #include <asm/uaccess.h>
18788  #include <asm/unistd.h>
18789 @@ -1002,6 +1004,59 @@ unsigned long avenrun[3];
18790  
18791  EXPORT_SYMBOL(avenrun);
18792  
18793 +#ifdef CONFIG_PM
18794 +static unsigned long avenrun_save[3];
18795 +/*
18796 + * save_avenrun - Record the values prior to starting a hibernation cycle.
18797 + * We do this to make the work done in hibernation invisible to userspace
18798 + * post-suspend. Some programs, including some MTAs, watch the load average
18799 + * and stop work until it lowers. Without this, they would stop working for
18800 + * a while post-resume, unnecessarily.
18801 + */
18802 +
18803 +static void save_avenrun(void)
18804 +{
18805 +       avenrun_save[0] = avenrun[0];
18806 +       avenrun_save[1] = avenrun[1];
18807 +       avenrun_save[2] = avenrun[2];
18808 +}
18809 +
18810 +static void restore_avenrun(void)
18811 +{
18812 +       if (!avenrun_save[0])
18813 +               return;
18814 +
18815 +       avenrun[0] = avenrun_save[0];
18816 +       avenrun[1] = avenrun_save[1];
18817 +       avenrun[2] = avenrun_save[2];
18818 +
18819 +       avenrun_save[0] = 0;
18820 +}
18821 +
18822 +static int avenrun_pm_callback(struct notifier_block *nfb,
18823 +                                       unsigned long action,
18824 +                                       void *ignored)
18825 +{
18826 +       switch (action) {
18827 +       case PM_HIBERNATION_PREPARE:
18828 +               save_avenrun();
18829 +               return NOTIFY_OK;
18830 +       case PM_POST_HIBERNATION:
18831 +               restore_avenrun();
18832 +               return NOTIFY_OK;
18833 +       }
18834 +
18835 +       return NOTIFY_DONE;
18836 +}
18837 +
18838 +static void register_pm_notifier_callback(void)
18839 +{
18840 +       pm_notifier(avenrun_pm_callback, 0);
18841 +}
18842 +#else
18843 +static inline void register_pm_notifier_callback(void) { }
18844 +#endif
18845 +
18846  /*
18847   * calc_load - given tick count, update the avenrun load estimates.
18848   * This is called while holding a write_lock on xtime_lock.
18849 @@ -1495,6 +1550,7 @@ void __init init_timers(void)
18850         BUG_ON(err == NOTIFY_BAD);
18851         register_cpu_notifier(&timers_nb);
18852         open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
18853 +       register_pm_notifier_callback();
18854  }
18855  
18856  /**
18857 diff --git a/lib/vsprintf.c b/lib/vsprintf.c
18858 index c399bc1..42e7072 100644
18859 --- a/lib/vsprintf.c
18860 +++ b/lib/vsprintf.c
18861 @@ -558,6 +558,29 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
18862         return number(buf, end, (unsigned long) ptr, 16, field_width, precision, flags);
18863  }
18864  
18865 +/*
18866 + * snprintf_used
18867 + *
18868 + * Functionality    : Print a string with parameters to a buffer of a
18869 + *                    limited size. Unlike vsnprintf, we return the number
18870 + *                    of bytes actually put in the buffer, not the number
18871 + *                    that would have been put in if it were big enough.
18872 + */
18873 +int snprintf_used(char *buffer, int buffer_size, const char *fmt, ...)
18874 +{
18875 +       int result;
18876 +       va_list args;
18877 +
18878 +       if (!buffer_size)
18879 +               return 0;
18880 +
18881 +       va_start(args, fmt);
18882 +       result = vsnprintf(buffer, buffer_size, fmt, args);
18883 +       va_end(args);
18884 +
18885 +       /* On truncation, vsnprintf wrote buffer_size - 1 chars plus NUL. */
18886 +       return result >= buffer_size ? buffer_size - 1 : result;
18886 +}
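+
+/* For example (illustrative), formatting the 11-character string
+ * "hello world" into an 8-byte buffer:
+ *
+ *     char buf[8];
+ *     int used = snprintf_used(buf, sizeof(buf), "hello world");
+ *
+ * vsnprintf() would report 11; 'used' stays within the buffer, so a
+ * caller can safely advance a write pointer by 'used' bytes.
+ */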
18887 +
18888  /**
18889   * vsnprintf - Format a string and place it in a buffer
18890   * @buf: The buffer to place the result into
18891 diff --git a/mm/Makefile b/mm/Makefile
18892 index da4ccf0..40d7f55 100644
18893 --- a/mm/Makefile
18894 +++ b/mm/Makefile
18895 @@ -11,7 +11,7 @@ obj-y                 := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
18896                            maccess.o page_alloc.o page-writeback.o pdflush.o \
18897                            readahead.o swap.o truncate.o vmscan.o \
18898                            prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
18899 -                          page_isolation.o mm_init.o $(mmu-y)
18900 +                          dyn_pageflags.o page_isolation.o mm_init.o $(mmu-y)
18901  
18902  obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
18903  obj-$(CONFIG_BOUNCE)   += bounce.o
18904 diff --git a/mm/dyn_pageflags.c b/mm/dyn_pageflags.c
18905 new file mode 100644
18906 index 0000000..30d95f0
18907 --- /dev/null
18908 +++ b/mm/dyn_pageflags.c
18909 @@ -0,0 +1,801 @@
18910 +/*
18911 + * mm/dyn_pageflags.c
18912 + *
18913 + * Copyright (C) 2004-2007 Nigel Cunningham <nigel at tuxonice net>
18914 + *
18915 + * This file is released under the GPLv2.
18916 + *
18917 + * Routines for dynamically allocating and releasing bitmaps
18918 + * used as pseudo-pageflags.
18919 + *
18920 + * We use bitmaps, built out of order zero allocations and
18921 + * linked together by kzalloc'd arrays of pointers into
18922 + * an array that looks like...
18923 + *
18924 + *     pageflags->bitmap[node][zone_id][page_num][ul]
18925 + *
18926 + * All of this is transparent to the caller, who just uses
18927 + * the allocate & free routines to create/destroy bitmaps,
18928 + * and get/set/clear to operate on individual flags.
18929 + *
18930 + * Bitmaps can be sparse, with the individual pages only being
18931 + * allocated when a bit is set in the page.
18932 + *
18933 + * Memory hotplugging support is work in progress. A zone's
18934 + * start_pfn may change. If it does, we need to reallocate
18935 + * the zone bitmap, adding additional pages to the front to
18936 + * cover the bitmap. For simplicity, we don't shift the
18937 + * contents of existing pages around. The lock is only used
18938 + * to avoid reentrancy when resizing zones. The replacement
18939 + * of old data with new is done atomically. If we try to test
18940 + * a bit in the new area before the update is completed, we
18941 + * know it's zero.
18942 + *
18943 + * TuxOnIce knows the structure of these pageflags, so that
18944 + * it can serialise them in the image header. TODO: Make
18945 + * that support more generic so that TuxOnIce doesn't need
18946 + * to know how dyn_pageflags are stored.
18947 + */
18948 +
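+/*
+ * Per-zone array layout, as used throughout this file: entry [0] holds
+ * the zone's start pfn, entry [1] the number of bitmap pages, and entries
+ * [2..n+1] point to the order-zero bitmap pages themselves:
+ *
+ *     bitmap[node][zone_id][0]     = (void *) zone_start_pfn
+ *     bitmap[node][zone_id][1]     = (void *) number_of_bitmap_pages
+ *     bitmap[node][zone_id][2 + n] = nth bitmap page, or NULL if sparse
+ */
+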
18949 +/* Avoid warnings in include/linux/mm.h */
18950 +struct page;
18951 +struct dyn_pageflags;
18952 +int test_dynpageflag(struct dyn_pageflags *bitmap, struct page *page);
18953 +
18954 +#include <linux/bootmem.h>
18955 +#include <linux/dyn_pageflags.h>
18956 +#include <linux/module.h>
18957 +
18958 +static LIST_HEAD(flags_list);
18959 +static DEFINE_SPINLOCK(flags_list_lock);
18960 +
18961 +static void* (*dyn_allocator)(unsigned long size, unsigned long flags);
18962 +
18963 +static int dyn_pageflags_debug;
18964 +
18965 +#define PR_DEBUG(a, b...) \
18966 +       do { if (dyn_pageflags_debug) printk(a, ##b); } while (0)
18967 +#define DUMP_DEBUG(bitmap) \
18968 +       do { if (dyn_pageflags_debug) dump_pagemap(bitmap); } while (0)
18969 +
18970 +#if BITS_PER_LONG == 32
18971 +#define UL_SHIFT 5
18972 +#else
18973 +#if BITS_PER_LONG == 64
18974 +#define UL_SHIFT 6
18975 +#else
18976 +#error Bits per long not 32 or 64?
18977 +#endif
18978 +#endif
18979 +
18980 +#define BIT_NUM_MASK ((sizeof(unsigned long) << 3) - 1)
18981 +#define PAGE_NUM_MASK (~((1 << (PAGE_SHIFT + 3)) - 1))
18982 +#define UL_NUM_MASK (~(BIT_NUM_MASK | PAGE_NUM_MASK))
18983 +
18984 +/*
18985 + * PAGENUMBER gives the index of the page within the zone.
18986 + * PAGEINDEX gives the index of the unsigned long within that page.
18987 + * PAGEBIT gives the index of the bit within the unsigned long.
18988 + */
18989 +#define PAGENUMBER(zone_offset) ((int) (zone_offset >> (PAGE_SHIFT + 3)))
18990 +#define PAGEINDEX(zone_offset) ((int) ((zone_offset & UL_NUM_MASK) >> UL_SHIFT))
18991 +#define PAGEBIT(zone_offset) ((int) (zone_offset & BIT_NUM_MASK))
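+
+/*
+ * Worked example (assuming PAGE_SHIFT == 12 and 64-bit longs): each bitmap
+ * page then covers PAGE_SIZE << 3 == 32768 pfns. For zone_offset 40010,
+ * PAGENUMBER == 40010 >> 15 == 1 (the second bitmap page), PAGEINDEX ==
+ * (40010 & UL_NUM_MASK) >> 6 == 113 (the unsigned long within that page)
+ * and PAGEBIT == 40010 & 63 == 10 (the bit within that unsigned long).
+ */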
18992 +
18993 +#define PAGE_UL_PTR(bitmap, node, zone_num, zone_pfn) \
18994 +       ((bitmap[node][zone_num][PAGENUMBER(zone_pfn)])+PAGEINDEX(zone_pfn))
18995 +
18996 +#define pages_for_zone(zone) \
18997 +       (DIV_ROUND_UP((zone)->spanned_pages, (PAGE_SIZE << 3)))
18998 +
18999 +#define pages_for_span(span) \
19000 +       (DIV_ROUND_UP(span, PAGE_SIZE << 3))
19001 +
19002 +/* __maybe_unused for testing functions below */
19003 +#define GET_BIT_AND_UL(pageflags, page) \
19004 +       struct zone *zone = page_zone(page); \
19005 +       unsigned long pfn = page_to_pfn(page); \
19006 +       unsigned long zone_pfn = pfn - zone->zone_start_pfn; \
19007 +       int node = page_to_nid(page); \
19008 +       int zone_num = zone_idx(zone); \
19009 +       int pagenum = PAGENUMBER(zone_pfn) + 2; \
19010 +       int page_offset = PAGEINDEX(zone_pfn); \
19011 +       unsigned long **zone_array = ((pageflags)->bitmap && \
19012 +               (pageflags)->bitmap[node] && \
19013 +               (pageflags)->bitmap[node][zone_num]) ? \
19014 +                       (pageflags)->bitmap[node][zone_num] : NULL; \
19015 +       unsigned long __maybe_unused *ul = (zone_array && \
19016 +               (unsigned long) zone_array[0] <= pfn && \
19017 +               (unsigned long) zone_array[1] >= (pagenum-2) && \
19018 +               zone_array[pagenum]) ? zone_array[pagenum] + page_offset : \
19019 +                 NULL; \
19020 +       int bit __maybe_unused = PAGEBIT(zone_pfn);
19021 +
19022 +#define for_each_online_pgdat_zone(pgdat, zone_nr) \
19023 +       for_each_online_pgdat(pgdat) \
19024 +               for (zone_nr = 0; zone_nr < MAX_NR_ZONES; zone_nr++)
19025 +
19026 +/**
19027 + * dump_pagemap - Display the contents of a bitmap for debugging purposes.
19028 + *
19029 + * @pagemap: The array to be dumped.
19030 + */
19031 +void dump_pagemap(struct dyn_pageflags *pagemap)
19032 +{
19033 +       int i = 0;
19034 +       struct pglist_data *pgdat;
19035 +       unsigned long ****bitmap = pagemap->bitmap;
19036 +
19037 +       printk(KERN_INFO " --- Dump bitmap %p ---\n", pagemap);
19038 +
19039 +       printk(KERN_INFO "%p: Sparse flag = %d\n",
19040 +                       &pagemap->sparse, pagemap->sparse);
19041 +       printk(KERN_INFO "%p: Bitmap      = %p\n",
19042 +                       &pagemap->bitmap, bitmap);
19043 +
19044 +       if (!bitmap)
19045 +               goto out;
19046 +
19047 +       for_each_online_pgdat(pgdat) {
19048 +               int node_id = pgdat->node_id, zone_nr;
19049 +               printk(KERN_INFO "%p: Node %d => %p\n",
19050 +                               &bitmap[node_id], node_id,
19051 +                               bitmap[node_id]);
19052 +               if (!bitmap[node_id])
19053 +                       continue;
19054 +               for (zone_nr = 0; zone_nr < MAX_NR_ZONES; zone_nr++) {
19055 +                       printk(KERN_INFO "%p:   Zone %d => %p%s\n",
19056 +                                       &bitmap[node_id][zone_nr], zone_nr,
19057 +                                       bitmap[node_id][zone_nr],
19058 +                                       bitmap[node_id][zone_nr] ? "" :
19059 +                                               " (empty)");
19060 +                       if (!bitmap[node_id][zone_nr])
19061 +                               continue;
19062 +
19063 +                       printk(KERN_INFO "%p:     Zone start pfn  = %p\n",
19064 +                                       &bitmap[node_id][zone_nr][0],
19065 +                                       bitmap[node_id][zone_nr][0]);
19066 +                       printk(KERN_INFO "%p:     Number of pages = %p\n",
19067 +                                       &bitmap[node_id][zone_nr][1],
19068 +                                       bitmap[node_id][zone_nr][1]);
19069 +                       for (i = 2; i < (unsigned long) bitmap[node_id]
19070 +                                       [zone_nr][1] + 2; i++)
19071 +                               printk(KERN_INFO
19072 +                                       "%p:     Page %2d         = %p\n",
19073 +                                       &bitmap[node_id][zone_nr][i],
19074 +                                       i - 2,
19075 +                                       bitmap[node_id][zone_nr][i]);
19076 +               }
19077 +       }
19078 +out:
19079 +       printk(KERN_INFO " --- Dump of bitmap %p finishes\n", pagemap);
19080 +}
19081 +EXPORT_SYMBOL_GPL(dump_pagemap);
19082 +
19083 +/**
19084 + * clear_dyn_pageflags - Zero all pageflags in a bitmap.
19085 + *
19086 + * @pagemap: The array to be cleared.
19087 + *
19088 + * Clear an array used to store dynamically allocated pageflags.
19089 + */
19090 +void clear_dyn_pageflags(struct dyn_pageflags *pagemap)
19091 +{
19092 +       int i = 0, zone_idx;
19093 +       struct pglist_data *pgdat;
19094 +       unsigned long ****bitmap = pagemap->bitmap;
19095 +
19096 +       for_each_online_pgdat_zone(pgdat, zone_idx) {
19097 +               int node_id = pgdat->node_id;
19098 +               struct zone *zone = &pgdat->node_zones[zone_idx];
19099 +
19100 +               if (!populated_zone(zone) ||
19101 +                  (!bitmap[node_id] || !bitmap[node_id][zone_idx]))
19102 +                       continue;
19103 +
19104 +               for (i = 2; i < pages_for_zone(zone) + 2; i++)
19105 +                       if (bitmap[node_id][zone_idx][i])
19106 +                               memset((bitmap[node_id][zone_idx][i]), 0,
19107 +                                               PAGE_SIZE);
19108 +       }
19109 +}
19110 +EXPORT_SYMBOL_GPL(clear_dyn_pageflags);
19111 +
19112 +/*
19113 + * Allocators.
19114 + *
19115 + * During boot, we want to use alloc_bootmem_low. Afterwards, we want
19116 + * kzalloc. These routines let us do that without causing compile-time
19117 + * warnings about mismatched sections, as would happen with a simple
19118 + * boot ? alloc_bootmem_low() : kzalloc() expression.
19119 + */
19120 +
19121 +/**
19122 + * boot_time_allocator - Allocator used while booting.
19123 + *
19124 + * @size: Number of bytes wanted.
19125 + * @flags: Allocation flags (ignored here).
19126 + */
19127 +static __init void *boot_time_allocator(unsigned long size, unsigned long flags)
19128 +{
19129 +       return alloc_bootmem_low(size);
19130 +}
19131 +
19132 +/**
19133 + * normal_allocator - Allocator used post-boot.
19134 + *
19135 + * @size: Number of bytes wanted.
19136 + * @flags: Allocation flags.
19137 + *
19138 + * Allocate memory for our page flags.
19139 + */
19140 +static void *normal_allocator(unsigned long size, unsigned long flags)
19141 +{
19142 +       if (size == PAGE_SIZE)
19143 +               return (void *) get_zeroed_page(flags);
19144 +       else
19145 +               return kzalloc(size, flags);
19146 +}
19147 +
19148 +/**
19149 + * dyn_pageflags_init - Do the earliest initialisation.
19150 + *
19151 + * Very early in the boot process, set our allocator (alloc_bootmem_low) and
19152 + * allocate bitmaps for slab and buddy pageflags.
19153 + */
19154 +void __init dyn_pageflags_init(void)
19155 +{
19156 +       dyn_allocator = boot_time_allocator;
19157 +}
19158 +
19159 +/**
19160 + * dyn_pageflags_use_kzalloc - Reset the allocator for normal use.
19161 + *
19162 + * Reset the allocator to our normal, post boot function.
19163 + */
19164 +void __init dyn_pageflags_use_kzalloc(void)
19165 +{
19166 +       dyn_allocator = normal_allocator;
19167 +}
19168 +
19169 +/**
19170 + * try_alloc_dyn_pageflag_part - Try to allocate a contiguous pointer array.
19171 + *
19172 + * @nr_ptrs: Number of pointers wanted.
19173 + * @ptr: Where to store the address of the (zeroed) array.
19173 + */
19174 +static int try_alloc_dyn_pageflag_part(int nr_ptrs, void **ptr)
19175 +{
19176 +       *ptr = (*dyn_allocator)(sizeof(void *) * nr_ptrs, GFP_ATOMIC);
19177 +
19178 +       if (*ptr)
19179 +               return 0;
19180 +
19181 +       printk(KERN_ERR
19182 +               "Error. Unable to allocate memory for dynamic pageflags.\n");
19183 +       return -ENOMEM;
19184 +}
19185 +
19186 +static int populate_bitmap_page(struct dyn_pageflags *pageflags, int take_lock,
19187 +                       unsigned long **page_ptr)
19188 +{
19189 +       void *address;
19190 +       unsigned long flags = 0;
19191 +
19192 +       if (take_lock)
19193 +               spin_lock_irqsave(&pageflags->struct_lock, flags);
19194 +
19195 +       /*
19196 +        * The page may have been allocated while we waited.
19197 +        */
19198 +       if (*page_ptr)
19199 +               goto out;
19200 +
19201 +       address = (*dyn_allocator)(PAGE_SIZE, GFP_ATOMIC);
19202 +
19203 +       if (!address) {
19204 +               PR_DEBUG("Error. Unable to allocate memory for "
19205 +                       "dynamic pageflags page.");
19206 +               if (pageflags)
19207 +                       spin_unlock_irqrestore(&pageflags->struct_lock, flags);
19208 +               return -ENOMEM;
19209 +       }
19210 +
19211 +       *page_ptr = address;
19212 +out:
19213 +       if (take_lock)
19214 +               spin_unlock_irqrestore(&pageflags->struct_lock, flags);
19215 +       return 0;
19216 +}
19217 +
19218 +/**
19219 + * resize_zone_bitmap - Resize the array of pages for a bitmap.
19220 + *
19221 + * Shrink or extend a list of pages for a zone in a bitmap, preserving
19222 + * existing data.
19223 + */
19224 +static int resize_zone_bitmap(struct dyn_pageflags *pagemap, struct zone *zone,
19225 +               unsigned long old_pages, unsigned long new_pages,
19226 +               unsigned long copy_offset, int take_lock)
19227 +{
19228 +       unsigned long **new_ptr = NULL, ****bitmap = pagemap->bitmap;
19229 +       int node_id = zone_to_nid(zone), zone_idx = zone_idx(zone),
19230 +           to_copy = min(old_pages, new_pages), result = 0;
19231 +       unsigned long **old_ptr = bitmap[node_id][zone_idx], i;
19232 +
19233 +       if (new_pages) {
19234 +               if (try_alloc_dyn_pageflag_part(new_pages + 2,
19235 +                                       (void **) &new_ptr))
19236 +                       return -ENOMEM;
19237 +
19238 +               if (old_pages)
19239 +                       memcpy(new_ptr + 2 + copy_offset, old_ptr + 2,
19240 +                                       sizeof(unsigned long *) * to_copy);
19241 +
19242 +               new_ptr[0] = (void *) zone->zone_start_pfn;
19243 +               new_ptr[1] = (void *) new_pages;
19244 +       }
19245 +
19246 +       /* Free/alloc bitmap pages. */
19247 +       if (old_pages > new_pages) {
19248 +               for (i = new_pages + 2; i < old_pages + 2; i++)
19249 +                       if (old_ptr[i])
19250 +                               free_page((unsigned long) old_ptr[i]);
19251 +       } else if (!pagemap->sparse) {
19252 +               for (i = old_pages + 2; i < new_pages + 2; i++)
19253 +                       if (populate_bitmap_page(NULL, take_lock,
19254 +                                       (unsigned long **) &new_ptr[i])) {
19255 +                               result = -ENOMEM;
19256 +                               break;
19257 +                       }
19258 +       }
19259 +
19260 +       bitmap[node_id][zone_idx] = new_ptr;
19261 +       kfree(old_ptr);
19262 +       return result;
19263 +}
19264 +
19265 +/**
19266 + * check_dyn_pageflag_zone - Resize a zone's portion of a dyn_pageflag array.
19267 + *
19268 + * @pagemap: The array to be worked on.
19269 + * @zone: The zone to get in sync with reality.
19270 + *
19271 + * Check the pagemap has correct allocations for the zone. This can be
19272 + * invoked when allocating a new bitmap, or for hot[un]plug, and so
19273 + * must deal with any disparities between zone_start_pfn/spanned_pages
19274 + * and what we have allocated. In addition, we must deal with the possibility
19275 + * of zone_start_pfn having changed.
19276 + */
19277 +int check_dyn_pageflag_zone(struct dyn_pageflags *pagemap, struct zone *zone,
19278 +               int force_free_all, int take_lock)
19279 +{
19280 +       int node_id = zone_to_nid(zone), zone_idx = zone_idx(zone);
19281 +       unsigned long copy_offset = 0, old_pages, new_pages;
19282 +       unsigned long **old_ptr = pagemap->bitmap[node_id][zone_idx];
19283 +
19284 +       old_pages = old_ptr ? (unsigned long) old_ptr[1] : 0;
19285 +       new_pages = force_free_all ? 0 : pages_for_span(zone->spanned_pages);
19286 +
19287 +       if (old_pages == new_pages &&
19288 +           (!old_pages || (unsigned long) old_ptr[0] == zone->zone_start_pfn))
19289 +               return 0;
19290 +
19291 +       if (old_pages &&
19292 +           (unsigned long) old_ptr[0] != zone->zone_start_pfn)
19293 +               copy_offset = pages_for_span((unsigned long) old_ptr[0] -
19294 +                                                       zone->zone_start_pfn);
19295 +
19296 +       /* New/expanded zone? */
19297 +       return resize_zone_bitmap(pagemap, zone, old_pages, new_pages,
19298 +                       copy_offset, take_lock);
19299 +}
19300 +
19301 +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
19302 +/**
19303 + * dyn_pageflags_hotplug - Add pages to bitmaps for hotplugged memory.
19304 + *
19305 + * Seek to expand bitmaps for hotplugged memory. We ignore any failure.
19306 + * Since we handle sparse bitmaps anyway, they'll be automatically
19307 + * populated as needed.
19308 + */
19309 +void dyn_pageflags_hotplug(struct zone *zone)
19310 +{
19311 +       struct dyn_pageflags *this;
19312 +
19313 +       list_for_each_entry(this, &flags_list, list)
19314 +               check_dyn_pageflag_zone(this, zone, 0, 1);
19315 +}
19316 +#endif
19317 +
19318 +/**
19319 + * free_dyn_pageflags - Free an array of dynamically allocated pageflags.
19320 + *
19321 + * @pagemap: The array to be freed.
19322 + *
19323 + * Free a dynamically allocated pageflags bitmap.
19324 + */
19325 +void free_dyn_pageflags(struct dyn_pageflags *pagemap)
19326 +{
19327 +       int zone_idx;
19328 +       struct pglist_data *pgdat;
19329 +       unsigned long flags;
19330 +
19331 +       DUMP_DEBUG(pagemap);
19332 +
19333 +       if (!pagemap->bitmap)
19334 +               return;
19335 +
19336 +       for_each_online_pgdat_zone(pgdat, zone_idx)
19337 +               check_dyn_pageflag_zone(pagemap,
19338 +                               &pgdat->node_zones[zone_idx], 1, 1);
19339 +
19340 +       for_each_online_pgdat(pgdat) {
19341 +               int i = pgdat->node_id;
19342 +
19343 +               if (pagemap->bitmap[i])
19344 +                       kfree((pagemap->bitmap)[i]);
19345 +       }
19346 +
19347 +       kfree(pagemap->bitmap);
19348 +       pagemap->bitmap = NULL;
19349 +
19350 +       pagemap->initialised = 0;
19351 +
19352 +       if (!pagemap->sparse) {
19353 +               spin_lock_irqsave(&flags_list_lock, flags);
19354 +               list_del_init(&pagemap->list);
19355 +               pagemap->sparse = 1;
19356 +               spin_unlock_irqrestore(&flags_list_lock, flags);
19357 +       }
19358 +}
19359 +EXPORT_SYMBOL_GPL(free_dyn_pageflags);
19360 +
19361 +/**
19362 + * allocate_dyn_pageflags - Allocate a bitmap.
19363 + *
19364 + * @pagemap: The bitmap we want to allocate.
19365 + * @sparse: Whether to make the array sparse.
19366 + *
19367 + * Allocate the high-level structure of the bitmap. If sparse, the actual
19368 + * bitmap pages are not allocated until they're needed. If not sparse, the
19369 + * bitmap is added to the global list so that, where memory hotplugging is
19370 + * supported, new pages can be allocated on hotplug events.
19371 + *
19372 + * This routine may be called directly, or indirectly when the first bit
19373 + * needs to be set on a previously unused bitmap.
19374 + */
19375 +int allocate_dyn_pageflags(struct dyn_pageflags *pagemap, int sparse)
19376 +{
19377 +       int zone_idx, result = -ENOMEM;
19378 +       struct zone *zone;
19379 +       struct pglist_data *pgdat;
19380 +       unsigned long flags;
19381 +
19382 +       if (!sparse && (pagemap->sparse || !pagemap->initialised)) {
19383 +               spin_lock_irqsave(&flags_list_lock, flags);
19384 +               list_add(&pagemap->list, &flags_list);
19385 +               spin_unlock_irqrestore(&flags_list_lock, flags);
19386 +       }
19387 +
19388 +       spin_lock_irqsave(&pagemap->struct_lock, flags);
19389 +
19390 +       pagemap->initialised = 1;
19391 +       pagemap->sparse = sparse;
19392 +
19393 +       if (!pagemap->bitmap && try_alloc_dyn_pageflag_part((1 << NODES_WIDTH),
19394 +                               (void **) &pagemap->bitmap))
19395 +               goto out;
19396 +
19397 +       for_each_online_pgdat(pgdat) {
19398 +               int node_id = pgdat->node_id;
19399 +
19400 +               if (!pagemap->bitmap[node_id] &&
19401 +                   try_alloc_dyn_pageflag_part(MAX_NR_ZONES,
19402 +                       (void **) &(pagemap->bitmap)[node_id]))
19403 +                               goto out;
19404 +
19405 +               for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
19406 +                       zone = &pgdat->node_zones[zone_idx];
19407 +
19408 +                       if (populated_zone(zone) &&
19409 +                           check_dyn_pageflag_zone(pagemap, zone, 0, 0))
19410 +                               goto out;
19411 +               }
19412 +       }
19413 +
19414 +       result = 0;
19415 +
19416 +out:
19417 +       spin_unlock_irqrestore(&pagemap->struct_lock, flags);
19418 +       return result;
19419 +}
19420 +EXPORT_SYMBOL_GPL(allocate_dyn_pageflags);
19421 +
19422 +/**
19423 + * test_dynpageflag - Test a page in a bitmap.
19424 + *
19425 + * @bitmap: The bitmap we're checking.
19426 + * @page: The page for which we want to test the matching bit.
19427 + *
19428 + * Test whether the bit is set in the array. If the relevant part of a
19429 + * sparse array has not been allocated, the result is zero.
19430 + */
19431 +int test_dynpageflag(struct dyn_pageflags *bitmap, struct page *page)
19432 +{
19433 +       GET_BIT_AND_UL(bitmap, page);
19434 +       return ul ? test_bit(bit, ul) : 0;
19435 +}
19436 +EXPORT_SYMBOL_GPL(test_dynpageflag);
19437 +
19438 +/**
19439 + * set_dynpageflag - Set a bit in a bitmap.
19440 + *
19441 + * @pageflags: The bitmap we're operating on.
19442 + * @page: The page for which we want to set the matching bit.
19443 + *
19444 + * Set the associated bit in the array. If the array is sparse, we
19445 + * seek to allocate the missing page.
19446 + */
19447 +void set_dynpageflag(struct dyn_pageflags *pageflags, struct page *page)
19448 +{
19449 +       GET_BIT_AND_UL(pageflags, page);
19450 +
19451 +       if (!ul) {
19452 +               /*
19453 +                * Sparse, hotplugged or unprepared.
19454 +                * Allocate / fill gaps in high levels
19455 +                */
19456 +               if (allocate_dyn_pageflags(pageflags, 1) ||
19457 +                   populate_bitmap_page(pageflags, 1, (unsigned long **)
19458 +                               &pageflags->bitmap[node][zone_num][pagenum])) {
19459 +                       printk(KERN_EMERG "Failed to allocate storage in a "
19460 +                                       "sparse bitmap.\n");
19461 +                       dump_pagemap(pageflags);
19462 +                       BUG();
19463 +               }
19464 +               set_dynpageflag(pageflags, page);
19465 +       } else
19466 +               set_bit(bit, ul);
19467 +}
19468 +EXPORT_SYMBOL_GPL(set_dynpageflag);
19469 +
19470 +/**
19471 + * clear_dynpageflag - Clear a bit in a bitmap.
19472 + *
19473 + * @bitmap: The bitmap we're operating on.
19474 + * @page: The page for which we want to clear the matching bit.
19475 + *
19476 + * Clear the associated bit in the array. It is not an error to be asked
19477 + * to clear a bit on a page we haven't allocated.
19478 + */
19479 +void clear_dynpageflag(struct dyn_pageflags *bitmap, struct page *page)
19480 +{
19481 +       GET_BIT_AND_UL(bitmap, page);
19482 +       if (ul)
19483 +               clear_bit(bit, ul);
19484 +}
19485 +EXPORT_SYMBOL_GPL(clear_dynpageflag);
19486 +
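+/*
+ * A minimal usage sketch (hypothetical caller) tying the above together:
+ *
+ *     static struct dyn_pageflags seen_map;
+ *
+ *     allocate_dyn_pageflags(&seen_map, 1);  (sparse bitmap)
+ *     set_dynpageflag(&seen_map, page);      (bitmap page allocated on demand)
+ *     if (test_dynpageflag(&seen_map, page))
+ *             clear_dynpageflag(&seen_map, page);
+ *     free_dyn_pageflags(&seen_map);
+ */
+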
19487 +/**
19488 + * get_next_bit_on - Get the next bit in a bitmap.
19489 + *
19490 + * @pageflags: The bitmap we're searching.
19491 + * @counter: The previous pfn; pass max_pfn + 1 to begin a new scan.
19492 + *
19493 + * Find the next pfn in the bitmap whose bit is set, returning a value
19494 + * greater than @counter. If no more bits are set, return max_pfn + 1.
19495 + */
19496 +unsigned long get_next_bit_on(struct dyn_pageflags *pageflags,
19497 +               unsigned long counter)
19498 +{
19499 +       struct page *page;
19500 +       struct zone *zone;
19501 +       unsigned long *ul = NULL;
19502 +       unsigned long zone_offset;
19503 +       int pagebit, zone_num, first = (counter == (max_pfn + 1)), node;
19504 +
19505 +       if (first)
19506 +               counter = first_online_pgdat()->node_zones->zone_start_pfn;
19507 +
19508 +       page = pfn_to_page(counter);
19509 +       zone = page_zone(page);
19510 +       node = zone->zone_pgdat->node_id;
19511 +       zone_num = zone_idx(zone);
19512 +       zone_offset = counter - zone->zone_start_pfn;
19513 +
19514 +       if (first)
19515 +               goto test;
19516 +
19517 +       do {
19518 +               zone_offset++;
19519 +
19520 +               if (zone_offset >= zone->spanned_pages) {
19521 +                       do {
19522 +                               zone = next_zone(zone);
19523 +                               if (!zone)
19524 +                                       return max_pfn + 1;
19525 +                       } while (!zone->spanned_pages);
19526 +
19527 +                       zone_num = zone_idx(zone);
19528 +                       node = zone->zone_pgdat->node_id;
19529 +                       zone_offset = 0;
19530 +               }
19531 +test:
19532 +               pagebit = PAGEBIT(zone_offset);
19533 +
19534 +               if (!pagebit || !ul) {
19535 +                       ul = pageflags->bitmap[node][zone_num]
19536 +                               [PAGENUMBER(zone_offset)+2];
19537 +                       if (ul)
19538 +                               ul += PAGEINDEX(zone_offset);
19539 +                       else {
19540 +                               PR_DEBUG("Unallocated page. Skipping from zone"
19541 +                                       " offset %lu to the start of the next "
19542 +                                       "one.\n", zone_offset);
19543 +                               zone_offset = roundup(zone_offset + 1,
19544 +                                               PAGE_SIZE << 3) - 1;
19545 +                               PR_DEBUG("New zone offset is %lu.\n",
19546 +                                               zone_offset);
19547 +                               continue;
19548 +                       }
19549 +               }
19550 +
19551 +               if (!ul || !(*ul & ~((1UL << pagebit) - 1))) {
19552 +                       zone_offset += BITS_PER_LONG - pagebit - 1;
19553 +                       continue;
19554 +               }
19555 +
19556 +       } while (!ul || !test_bit(pagebit, ul));
19557 +
19558 +       return zone->zone_start_pfn + zone_offset;
19559 +}
19560 +EXPORT_SYMBOL_GPL(get_next_bit_on);
19561 +
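+/*
+ * Iteration sketch: a fresh scan is begun by passing max_pfn + 1, and the
+ * same value doubles as the end-of-bitmap sentinel:
+ *
+ *     unsigned long pfn = get_next_bit_on(&map, max_pfn + 1);
+ *
+ *     while (pfn <= max_pfn) {
+ *             examine(pfn_to_page(pfn));     (hypothetical callback)
+ *             pfn = get_next_bit_on(&map, pfn);
+ *     }
+ */
+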
19562 +#ifdef SELF_TEST
19563 +#include <linux/jiffies.h>
19564 +
19565 +static __init int dyn_pageflags_test(void)
19566 +{
19567 +       struct dyn_pageflags test_map;
19568 +       struct page *test_page1 = pfn_to_page(1);
19569 +       unsigned long pfn = 0, start, end;
19570 +       int i, iterations;
19571 +
19572 +       memset(&test_map, 0, sizeof(test_map));
19573 +
19574 +       printk("Dynpageflags testing...\n");
19575 +
19576 +       printk(KERN_INFO "Set page 1...");
19577 +       set_dynpageflag(&test_map, test_page1);
19578 +       if (test_dynpageflag(&test_map, test_page1))
19579 +               printk(KERN_INFO "Ok.\n");
19580 +       else
19581 +               printk(KERN_INFO "FAILED.\n");
19582 +
19583 +       printk(KERN_INFO "Test memory hotplugging #1 ...");
19584 +       {
19585 +               unsigned long orig_size;
19586 +               GET_BIT_AND_UL(&test_map, test_page1);
19587 +               orig_size = (unsigned long) test_map.bitmap[node][zone_num][1];
19588 +               /*
19589 +                * Use the code triggered when zone_start_pfn lowers,
19590 +                * checking that our bit is then set in the third page.
19591 +                */
19592 +               resize_zone_bitmap(&test_map, zone, orig_size,
19593 +                               orig_size + 2, 2, 0);
19594 +               DUMP_DEBUG(&test_map);
19595 +               if ((unsigned long) test_map.bitmap[node][zone_num]
19596 +                               [pagenum + 2] &&
19597 +                   (unsigned long) test_map.bitmap[node][zone_num]
19598 +                               [pagenum + 2][0] == 2UL)
19599 +                       printk(KERN_INFO "Ok.\n");
19600 +               else
19601 +                       printk(KERN_INFO "FAILED.\n");
19602 +       }
19603 +
19604 +       printk(KERN_INFO "Test memory hotplugging #2 ...");
19605 +       {
19606 +               /*
19607 +                * Test expanding bitmap length.
19608 +                */
19609 +               unsigned long orig_size;
19610 +               GET_BIT_AND_UL(&test_map, test_page1);
19611 +               orig_size = (unsigned long) test_map.bitmap[node]
19612 +                                                       [zone_num][1];
19613 +               resize_zone_bitmap(&test_map, zone, orig_size,
19614 +                               orig_size + 2, 0, 0);
19615 +               DUMP_DEBUG(&test_map);
19616 +               pagenum += 2; /* Offset for first test */
19617 +               if (test_map.bitmap[node][zone_num][pagenum] &&
19618 +                   test_map.bitmap[node][zone_num][pagenum][0] == 2UL &&
19619 +                   (unsigned long) test_map.bitmap[node][zone_num][1] ==
19620 +                                               orig_size + 2)
19621 +                       printk(KERN_INFO "Ok.\n");
19622 +               else
19623 +                       printk(KERN_INFO "FAILED ([%d][%d][%d]: %p && %lu == "
19624 +                               "2UL  && %p == %lu).\n",
19625 +                               node, zone_num, pagenum,
19626 +                               test_map.bitmap[node][zone_num][pagenum],
19627 +                               test_map.bitmap[node][zone_num][pagenum] ?
19628 +                               test_map.bitmap[node][zone_num][pagenum][0] : 0,
19629 +                               test_map.bitmap[node][zone_num][1],
19630 +                               orig_size + 2);
19631 +       }
19632 +
19633 +       free_dyn_pageflags(&test_map);
19634 +
19635 +       allocate_dyn_pageflags(&test_map, 0);
19636 +
19637 +       start = jiffies;
19638 +
19639 +       iterations = 25000000 / max_pfn;
19640 +
19641 +       for (i = 0; i < iterations; i++) {
19642 +               for (pfn = 0; pfn < max_pfn; pfn++)
19643 +                       set_dynpageflag(&test_map, pfn_to_page(pfn));
19644 +               for (pfn = 0; pfn < max_pfn; pfn++)
19645 +                       clear_dynpageflag(&test_map, pfn_to_page(pfn));
19646 +       }
19647 +
19648 +       end = jiffies;
19649 +
19650 +       free_dyn_pageflags(&test_map);
19651 +
19652 +       printk(KERN_INFO "Dyn: %d iterations of setting & clearing all %lu "
19653 +                       "flags took %lu jiffies.\n",
19654 +                       iterations, max_pfn, end - start);
19655 +
19656 +       start = jiffies;
19657 +
19658 +       for (i = 0; i < iterations; i++) {
19659 +               for (pfn = 0; pfn < max_pfn; pfn++)
19660 +                       set_bit(7, &(pfn_to_page(pfn))->flags);
19661 +               for (pfn = 0; pfn < max_pfn; pfn++)
19662 +                       clear_bit(7, &(pfn_to_page(pfn))->flags);
19663 +       }
19664 +
19665 +       end = jiffies;
19666 +
19667 +       printk(KERN_INFO "Real flags: %d iterations of setting & clearing "
19668 +                       "all %lu flags took %lu jiffies.\n",
19669 +                       iterations, max_pfn, end - start);
19670 +
19671 +       iterations = 25000000;
19672 +
19673 +       start = jiffies;
19674 +
19675 +       for (i = 0; i < iterations; i++) {
19676 +               set_dynpageflag(&test_map, pfn_to_page(1));
19677 +               clear_dynpageflag(&test_map, pfn_to_page(1));
19678 +       }
19679 +
19680 +       end = jiffies;
19681 +
19682 +       printk(KERN_INFO "Dyn: %d iterations of setting & clearing all one "
19683 +                       "flag took %lu jiffies.\n", iterations, end - start);
19684 +
19685 +       start = jiffies;
19686 +
19687 +       for (i = 0; i < iterations; i++) {
19688 +               set_bit(7, &(pfn_to_page(1))->flags);
19689 +               clear_bit(7, &(pfn_to_page(1))->flags);
19690 +       }
19691 +
19692 +       end = jiffies;
19693 +
19694 +       printk(KERN_INFO "Real pageflag: %d iterations of setting & clearing "
19695 +                       "all one flag took %lu jiffies.\n",
19696 +                       iterations, end - start);
19697 +       return 0;
19698 +}
19699 +
19700 +late_initcall(dyn_pageflags_test);
19701 +#endif
19702 +
19703 +static int __init dyn_pageflags_debug_setup(char *str)
19704 +{
19705 +       printk(KERN_INFO "Dynamic pageflags debugging enabled.\n");
19706 +       dyn_pageflags_debug = 1;
19707 +       return 1;
19708 +}
19709 +
19710 +__setup("dyn_pageflags_debug", dyn_pageflags_debug_setup);
19711 diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
19712 index 89fee2d..bb289fd 100644
19713 --- a/mm/memory_hotplug.c
19714 +++ b/mm/memory_hotplug.c
19715 @@ -213,6 +213,8 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
19716         pgdat_resize_unlock(zone->zone_pgdat, &flags);
19717         memmap_init_zone(nr_pages, nid, zone_type,
19718                          phys_start_pfn, MEMMAP_HOTPLUG);
19719 +
19720 +       dyn_pageflags_hotplug(zone);
19721         return 0;
19722  }
19723  
19724 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
19725 index 27b8681..6a707ed 100644
19726 --- a/mm/page_alloc.c
19727 +++ b/mm/page_alloc.c
19728 @@ -1777,6 +1777,26 @@ static unsigned int nr_free_zone_pages(int offset)
19729         return sum;
19730  }
19731  
19732 +static unsigned int nr_unallocated_zone_pages(int offset)
19733 +{
19734 +       struct zoneref *z;
19735 +       struct zone *zone;
19736 +
19737 +       /* Just pick one node, since fallback list is circular */
19738 +       unsigned int sum = 0;
19739 +
19740 +       struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
19741 +
19742 +       for_each_zone_zonelist(zone, z, zonelist, offset) {
19743 +               unsigned long high = zone->pages_high;
19744 +               unsigned long left = zone_page_state(zone, NR_FREE_PAGES);
19745 +               if (left > high)
19746 +                       sum += left - high;
19747 +       }
19748 +
19749 +       return sum;
19750 +}
19751 +
19752  /*
19753   * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
19754   */
19755 @@ -1787,6 +1807,15 @@ unsigned int nr_free_buffer_pages(void)
19756  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
19757  
19758  /*
19759 + * Amount of free RAM in ZONE_DMA and ZONE_NORMAL above the zones'
19760 + * pages_high watermarks
19760 + */
19761 +unsigned int nr_unallocated_buffer_pages(void)
19762 +{
19763 +       return nr_unallocated_zone_pages(gfp_zone(GFP_USER));
19764 +}
19765 +EXPORT_SYMBOL_GPL(nr_unallocated_buffer_pages);
19766 +
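+/*
+ * Illustrative use (hypothetical figures): a hibernation implementation
+ * can compare this against the pages its image needs, and reclaim the
+ * difference before snapshotting:
+ *
+ *     if (nr_unallocated_buffer_pages() < pages_needed)
+ *             shrink_all_memory(pages_needed);
+ */
+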
19767 +/*
19768   * Amount of free RAM allocatable within all zones
19769   */
19770  unsigned int nr_free_pagecache_pages(void)
19771 diff --git a/mm/vmscan.c b/mm/vmscan.c
19772 index 1ff1a58..c02c6e6 100644
19773 --- a/mm/vmscan.c
19774 +++ b/mm/vmscan.c
19775 @@ -811,6 +811,28 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
19776         return nr_taken;
19777  }
19778  
19779 +/* return_lru_pages puts a list of pages back on a zone's lru lists. */
19780 +
19781 +static void return_lru_pages(struct list_head *page_list, struct zone *zone,
19782 +               struct pagevec *pvec)
19783 +{
19784 +       while (!list_empty(page_list)) {
19785 +               struct page *page = lru_to_page(page_list);
19786 +               VM_BUG_ON(PageLRU(page));
19787 +               SetPageLRU(page);
19788 +               list_del(&page->lru);
19789 +               if (PageActive(page))
19790 +                       add_page_to_active_list(zone, page);
19791 +               else
19792 +                       add_page_to_inactive_list(zone, page);
19793 +               if (!pagevec_add(pvec, page)) {
19794 +                       spin_unlock_irq(&zone->lru_lock);
19795 +                       __pagevec_release(pvec);
19796 +                       spin_lock_irq(&zone->lru_lock);
19797 +               }
19798 +       }
19799 +}
19800 +
19801  static unsigned long isolate_pages_global(unsigned long nr,
19802                                         struct list_head *dst,
19803                                         unsigned long *scanned, int order,
19804 @@ -861,7 +883,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
19805         lru_add_drain();
19806         spin_lock_irq(&zone->lru_lock);
19807         do {
19808 -               struct page *page;
19809                 unsigned long nr_taken;
19810                 unsigned long nr_scan;
19811                 unsigned long nr_freed;
19812 @@ -923,21 +944,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
19813                 /*
19814                  * Put back any unfreeable pages.
19815                  */
19816 -               while (!list_empty(&page_list)) {
19817 -                       page = lru_to_page(&page_list);
19818 -                       VM_BUG_ON(PageLRU(page));
19819 -                       SetPageLRU(page);
19820 -                       list_del(&page->lru);
19821 -                       if (PageActive(page))
19822 -                               add_page_to_active_list(zone, page);
19823 -                       else
19824 -                               add_page_to_inactive_list(zone, page);
19825 -                       if (!pagevec_add(&pvec, page)) {
19826 -                               spin_unlock_irq(&zone->lru_lock);
19827 -                               __pagevec_release(&pvec);
19828 -                               spin_lock_irq(&zone->lru_lock);
19829 -                       }
19830 -               }
19831 +               return_lru_pages(&page_list, zone, &pvec);
19832         } while (nr_scanned < max_scan);
19833         spin_unlock(&zone->lru_lock);
19834  done:
19835 @@ -1665,6 +1672,72 @@ out:
19836         return nr_reclaimed;
19837  }
19838  
19839 +struct lru_save {
19840 +       struct zone             *zone;
19841 +       struct list_head        active_list;
19842 +       struct list_head        inactive_list;
19843 +       struct lru_save         *next;
19844 +};
19845 +
19846 +struct lru_save *lru_save_list;
19847 +
19848 +void unlink_lru_lists(void)
19849 +{
19850 +       struct zone *zone;
19851 +
19852 +       for_each_zone(zone) {
19853 +               struct lru_save *this;
19854 +               unsigned long moved, scanned;
19855 +
19856 +               if (!zone->spanned_pages)
19857 +                       continue;
19858 +
19859 +               this = kzalloc(sizeof(struct lru_save), GFP_ATOMIC);
19861 +
19862 +               BUG_ON(!this);
19863 +
19864 +               this->next = lru_save_list;
19865 +               lru_save_list = this;
19866 +
19867 +               this->zone = zone;
19868 +
19869 +               spin_lock_irq(&zone->lru_lock);
19870 +               INIT_LIST_HEAD(&this->active_list);
19871 +               INIT_LIST_HEAD(&this->inactive_list);
19872 +               moved = isolate_lru_pages(zone_page_state(zone, NR_ACTIVE),
19873 +                               &zone->active_list, &this->active_list,
19874 +                               &scanned, 0, ISOLATE_BOTH);
19875 +               __mod_zone_page_state(zone, NR_ACTIVE, -moved);
19876 +               moved = isolate_lru_pages(zone_page_state(zone, NR_INACTIVE),
19877 +                               &zone->inactive_list, &this->inactive_list,
19878 +                               &scanned, 0, ISOLATE_BOTH);
19879 +               __mod_zone_page_state(zone, NR_INACTIVE, -moved);
19880 +               spin_unlock_irq(&zone->lru_lock);
19881 +       }
19882 +}
19883 +
19884 +void relink_lru_lists(void)
19885 +{
19886 +       while (lru_save_list) {
19887 +               struct lru_save *this = lru_save_list;
19888 +               struct zone *zone = this->zone;
19889 +               struct pagevec pvec;
19890 +
19891 +               pagevec_init(&pvec, 1);
19892 +
19893 +               lru_save_list = this->next;
19894 +
19895 +               spin_lock_irq(&zone->lru_lock);
19896 +               return_lru_pages(&this->active_list, zone, &pvec);
19897 +               return_lru_pages(&this->inactive_list, zone, &pvec);
19898 +               spin_unlock_irq(&zone->lru_lock);
19899 +               pagevec_release(&pvec);
19900 +
19901 +               kfree(this);
19902 +       }
19903 +}
19904 +
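+/*
+ * Pairing sketch: a hibernation implementation such as TuxOnIce would
+ * bracket its atomic copy with these calls, keeping the LRU lists stable
+ * while the image is written:
+ *
+ *     unlink_lru_lists();
+ *     ...take the atomic snapshot of memory...
+ *     relink_lru_lists();
+ */
+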
19905  /*
19906   * The background pageout daemon, started as a kernel thread
19907   * from the init process. 
19908 @@ -1749,6 +1822,9 @@ void wakeup_kswapd(struct zone *zone, int order)
19909         if (!populated_zone(zone))
19910                 return;
19911  
19912 +       if (freezer_is_on())
19913 +               return;
19914 +
19915         pgdat = zone->zone_pgdat;
19916         if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
19917                 return;
19918 @@ -1762,6 +1838,109 @@ void wakeup_kswapd(struct zone *zone, int order)
19919  }
19920  
19921  #ifdef CONFIG_PM
19922 +static unsigned long shrink_ps1_zone(struct zone *zone,
19923 +       unsigned long total_to_free, struct scan_control sc)
19924 +{
19925 +       unsigned long freed = 0;
19926 +
19927 +       while (total_to_free > freed) {
19928 +               unsigned long nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
19929 +               struct reclaim_state reclaim_state;
19930 +
19931 +               if (nr_slab > total_to_free)
19932 +                       nr_slab = total_to_free;
19933 +
19934 +               reclaim_state.reclaimed_slab = 0;
+               /* shrink_slab() reports progress via current->reclaim_state. */
+               current->reclaim_state = &reclaim_state;
19935 +               shrink_slab(nr_slab, sc.gfp_mask, nr_slab);
+               current->reclaim_state = NULL;
19936 +               if (!reclaim_state.reclaimed_slab)
19937 +                       return freed;
19938 +
19939 +               freed += reclaim_state.reclaimed_slab;
19940 +       }
19941 +
19942 +       return freed;
19943 +}
19944 +
19945 +unsigned long shrink_ps2_zone(struct zone *zone, unsigned long total_to_free,
19946 +               struct scan_control sc)
19947 +{
19948 +       int prio;
19949 +       unsigned long freed = 0;
19950 +       if (!populated_zone(zone) || zone_is_all_unreclaimable(zone))
19951 +               return 0;
19952 +
19953 +       for (prio = DEF_PRIORITY; prio >= 0; prio--) {
19954 +               unsigned long to_free, just_freed, orig_size;
19955 +               unsigned long old_nr_active;
19956 +
19957 +               to_free = min(zone_page_state(zone, NR_ACTIVE) +
19958 +                               zone_page_state(zone, NR_INACTIVE),
19959 +                               total_to_free - freed);
19960 +
19961 +               if (!to_free)
19962 +                       return freed;
19963 +
19964 +               sc.swap_cluster_max = to_free -
19965 +                       zone_page_state(zone, NR_INACTIVE);
19966 +
19967 +               do {
19968 +                       old_nr_active = zone_page_state(zone, NR_ACTIVE);
19969 +                       zone->nr_scan_active = sc.swap_cluster_max - 1;
19970 +                       shrink_active_list(sc.swap_cluster_max, zone, &sc,
19971 +                                       prio);
19972 +                       zone->nr_scan_active = 0;
19973 +
19974 +                       sc.swap_cluster_max = to_free - zone_page_state(zone,
19975 +                                       NR_INACTIVE);
19976 +
19977 +               } while (sc.swap_cluster_max > 0 &&
19978 +                        zone_page_state(zone, NR_ACTIVE) > old_nr_active);
19979 +
19980 +               to_free = min(zone_page_state(zone, NR_ACTIVE) +
19981 +                               zone_page_state(zone, NR_INACTIVE),
19982 +                               total_to_free - freed);
19983 +
19984 +               do {
19985 +                       orig_size = zone_page_state(zone, NR_ACTIVE) +
19986 +                               zone_page_state(zone, NR_INACTIVE);
19987 +                       zone->nr_scan_inactive = to_free;
19988 +                       sc.swap_cluster_max = to_free;
19989 +                       shrink_inactive_list(to_free, zone, &sc);
19990 +                       just_freed = (orig_size -
19991 +                               (zone_page_state(zone, NR_ACTIVE) +
19992 +                                zone_page_state(zone, NR_INACTIVE)));
19993 +                       zone->nr_scan_inactive = 0;
19994 +                       freed += just_freed;
19995 +               } while (just_freed > 0 && freed < total_to_free);
19996 +       }
19997 +
19998 +       return freed;
19999 +}
20000 +
20001 +void shrink_one_zone(struct zone *zone, unsigned long total_to_free,
20002 +               int ps_wanted)
20003 +{
20004 +       unsigned long freed = 0;
20005 +       struct scan_control sc = {
20006 +               .gfp_mask = GFP_KERNEL,
20007 +               .may_swap = 0,
20008 +               .may_writepage = 1,
20009 +               .swappiness = vm_swappiness,
20010 +               .isolate_pages = isolate_pages_global,
20011 +       };
20012 +
20013 +       if (!total_to_free)
20014 +               return;
20015 +
20016 +       if (is_highmem(zone))
20017 +               sc.gfp_mask |= __GFP_HIGHMEM;
20018 +
20019 +       if (ps_wanted & 2)
20020 +               freed = shrink_ps2_zone(zone, total_to_free, sc);
20021 +       if (ps_wanted & 1)
20022 +               shrink_ps1_zone(zone, total_to_free - freed, sc);
20023 +}
20024 +
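+/*
+ * Illustrative call (hypothetical values): ask a zone for roughly 1024
+ * pages, scanning the LRU lists first (bit 2), then reclaimable slab
+ * (bit 1):
+ *
+ *     shrink_one_zone(zone, 1024, 3);
+ */
+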
20025  /*
20026   * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
20027   * from LRU lists system-wide, for given pass and priority, and returns the